;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;* Copyright (C) 2013-2020 MulticoreWare, Inc
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*          Min Chen <chenm003@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
hmul_8p:   times 8 db 1
           times 4 db 1, -1
           times 8 db 1
           times 4 db 1, -1
hmul_4p:   times 4 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
hmul_8w:   times 4 dw 1
           times 2 dw 1, -1
           times 4 dw 1
           times 2 dw 1, -1
psy_pp_shuff1: dq 0, 1, 8, 9, 4, 5, 12, 13
psy_pp_shuff2: dq 2, 3, 10, 11, 6, 7, 14, 15
psy_pp_shuff3: dq 0, 0, 8, 8, 1, 1, 9, 9

ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

SECTION .text

cextern pb_0
cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern pd_2
cextern hmul_16p
cextern pb_movemask
cextern pb_movemask_32
cextern pw_pixel_max

%if BIT_DEPTH == 12
    %define SSIMRD_SHIFT 4
%elif BIT_DEPTH == 10
    %define SSIMRD_SHIFT 2
%elif BIT_DEPTH == 8
    %define SSIMRD_SHIFT 0
%else
    %error Unsupported BIT_DEPTH!
%endif

;=============================================================================
; SATD
;=============================================================================
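; SATD = sum of absolute transformed differences: subtract the two blocks,
; run a 4x4 (rows then columns) Hadamard transform on the residual and sum
; the absolute values of the coefficients. The kernels below fold the last
; butterfly stage and the conventional halving of the total into an abs+max
; step, using |a+b| + |a-b| == 2*max(|a|, |b|).
;
; Illustrative, unoptimized C sketch of one 4x4 SATD (names hypothetical,
; not part of this file; the real reference primitive lives in the C code):
;
;   int satd_4x4(const pixel *p1, intptr_t s1, const pixel *p2, intptr_t s2)
;   {
;       int d[4][4], sum = 0;
;       for (int i = 0; i < 4; i++)
;           for (int j = 0; j < 4; j++)
;               d[i][j] = p1[i * s1 + j] - p2[i * s2 + j];
;       for (int i = 0; i < 4; i++)   // horizontal 4-point Hadamard
;       {
;           int t0 = d[i][0] + d[i][1], t1 = d[i][0] - d[i][1];
;           int t2 = d[i][2] + d[i][3], t3 = d[i][2] - d[i][3];
;           d[i][0] = t0 + t2; d[i][2] = t0 - t2;
;           d[i][1] = t1 + t3; d[i][3] = t1 - t3;
;       }
;       for (int j = 0; j < 4; j++)   // vertical 4-point Hadamard
;       {
;           int t0 = d[0][j] + d[1][j], t1 = d[0][j] - d[1][j];
;           int t2 = d[2][j] + d[3][j], t3 = d[2][j] - d[3][j];
;           d[0][j] = t0 + t2; d[2][j] = t0 - t2;
;           d[1][j] = t1 + t3; d[3][j] = t1 - t3;
;       }
;       for (int i = 0; i < 4; i++)
;           for (int j = 0; j < 4; j++)
;               sum += abs(d[i][j]);
;       return sum >> 1;              // unnormalized transform, hence >> 1
;   }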
%macro JDUP 2
%if cpuflag(sse4)
    ; just use shufps on anything post conroe
    shufps %1, %2, 0
%elif cpuflag(ssse3) && notcpuflag(atom)
    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on conroe
    punpcklqdq %1, %2
    movsldup %1, %1
%else
    ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
    punpckldq %1, %2
%endif
%endmacro

%macro HSUMSUB 5
    pmaddubsw m%2, m%5
    pmaddubsw m%1, m%5
    pmaddubsw m%4, m%5
    pmaddubsw m%3, m%5
%endmacro

%macro DIFF_UNPACK_SSE2 5
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
    movd %1, %3
    movd %2, %4
    JDUP %1, %2
%endmacro

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
    movddup m%3, %6
    movddup m%4, %8
    movddup m%1, %5
    movddup m%2, %7
%endmacro

%macro LOAD_DUP_4x8P_PENRYN 8
    ; penryn and nehalem run punpcklqdq and movddup in different units
    movh m%3, %6
    movh m%4, %8
    punpcklqdq m%3, m%3
    movddup m%1, %5
    punpcklqdq m%4, m%4
    movddup m%2, %7
%endmacro

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    movddup m%1, [%7]
    movddup m%2, [%7+8]
    mova m%4, [%6]
    movddup m%3, m%4
    punpckhqdq m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    movu m%4, [%7]
    mova m%2, [%6]
    DEINTB %1, %2, %3, %4, %5
    psubw m%1, m%3
    psubw m%2, m%4
    SUMSUB_BA w, %1, %2, %3
%endmacro

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro

%macro LOAD_SUMSUB_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
    vbroadcasti128 m%1, [%6]
    vbroadcasti128 m%3, [%7]
    vbroadcasti128 m%2, [%8]
    vbroadcasti128 m%4, [%9]
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
    mova xm%3, %6
    mova xm%4, %8
    mova xm%1, %5
    mova xm%2, %7
    vpermq m%3, m%3, q0011
    vpermq m%4, m%4, q0011
    vpermq m%1, m%1, q0011
    vpermq m%2, m%2, q0011
%endmacro

%macro LOAD_SUMSUB8_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
    %xdefine %%n nn%1
    %assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
    LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF m7, m3, none, [r0+  r4+offset], [r2+  r5+offset]
%if %3
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
    paddw m4, m6
;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
;    pxor m5, m5
;    punpcklwd m6, m4, m5
;    punpckhwd m4, m5
;    paddd m4, m6
;%endif
    SWAP %%n, 4
%endmacro

; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%2, m%6
    punpckhwd m%2, m%6
    paddd m%8, m%7
    paddd m%8, m%2
  %else
    paddw m%8, m%2
  %endif
%else
    SWAP %8, %2
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%8, m%6
    punpckhwd m%8, m%6
    paddd m%8, m%7
  %endif
%endif
%if %1
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%4, m%6
    punpckhwd m%4, m%6
    paddd m%8, m%7
    paddd m%8, m%4
  %else
    paddw m%8, m%4
  %endif
%else
    HADAMARD 1, max, %3, %5, %6, %7
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%3, m%6
    punpckhwd m%3, m%6
    paddd m%8, m%7
    paddd m%8, m%3
  %else
    paddw m%8, m%3
  %endif
%endif
%endmacro

%macro SATD_8x4_1_SSE 10
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif

    pxor m%10, m%10
    punpcklwd m%9, m%2, m%10
    paddd m%8, m%9
    punpckhwd m%9, m%2, m%10
    paddd m%8, m%9

%if %1
    pxor m%10, m%10
    punpcklwd m%9, m%4, m%10
    paddd m%8, m%9
    punpckhwd m%9, m%4, m%10
    paddd m%8, m%9
%else
    HADAMARD 1, max, %3, %5, %6, %7
    pxor m%10, m%10
    punpcklwd m%9, m%3, m%10
    paddd m%8, m%9
    punpckhwd m%9, m%3, m%10
    paddd m%8, m%9
%endif
%endmacro

%macro SATD_START_MMX 0
    FIX_STRIDES r1, r3
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%endmacro

%macro SATD_END_MMX 0
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
    movd eax, m0
%else ; !HIGH_BIT_DEPTH
    pshufw m1, m0, q1032
    paddw m0, m1
    pshufw m1, m0, q2301
    paddw m0, m1
    movd eax, m0
    and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
    EMMS
    RET
%endmacro
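; SSIM-RD helpers: SSIM_DIST_* accumulates the squared pixel difference
; (m0 - m1)^2 into m4 and the squared down-shifted source (m0 >> SSIMRD_SHIFT)^2
; into m7; NORM_FACT_* accumulates (m0 >> SSIMRD_SHIFT)^2 into m3.
; SSIMRD_SHIFT (0/2/4 at 8/10/12-bit) scales every bit depth back to an
; 8-bit range before squaring. The *_HIGH variants widen to 64-bit partial
; sums with pmuldq/paddq; the *_LOW variants stay on pmaddwd's 32-bit sums.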
%macro SSIM_DIST_HIGH 2
    vpsrld m6, m0, SSIMRD_SHIFT
    vpsubd m0, m1

    vpmuldq m2, m0, m0
    vpsrldq m0, m0, 4
    vpmuldq m0, m0, m0
    vpaddq m0, m2

    vpmuldq m2, m6, m6
    vpsrldq m6, m6, 4
    vpmuldq m6, m6, m6
    vpaddq m6, m2

    vpaddq m4, m0
    vpaddq m7, m6
%endmacro

%macro NORM_FACT_HIGH 1
    vpsrld m1, m0, SSIMRD_SHIFT
    vpmuldq m2, m1, m1
    vpsrldq m1, m1, 4
    vpmuldq m1, m1, m1

    vpaddq m1, m2
    vpaddq m3, m1
%endmacro

%macro SSIM_DIST_LOW 2
    vpsrlw m6, m0, SSIMRD_SHIFT
    vpsubw m0, m1

    vpmaddwd m0, m0, m0
    vpmaddwd m6, m6, m6

    vpaddd m4, m0
    vpaddd m7, m6
%endmacro

%macro NORM_FACT_LOW 1
    vpsrlw m1, m0, SSIMRD_SHIFT
    vpmaddwd m1, m1, m1
    vpaddd m3, m1
%endmacro

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_satd_4x4, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 0
    SATD_END_MMX

%macro SATD_START_SSE2 2-3 0
    FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
    pxor %2, %2
%elif cpuflag(ssse3) && notcpuflag(atom)
%if mmsize==32
    mova %2, [hmul_16p]
%else
    mova %2, [hmul_8p]
%endif
%endif
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor %1, %1
%endmacro

%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
  %if BIT_DEPTH == 12
    HADDD %1, xm0
  %else ; BIT_DEPTH == 12
    HADDUW %1, xm0
  %endif ; BIT_DEPTH == 12
  %if %0 == 2
    paddd %1, %2
  %endif
%else
    HADDW %1, xm7
%endif
    movd eax, %1
    RET
%endmacro

%macro SATD_ACCUM 3
%if HIGH_BIT_DEPTH
    HADDUW %1, %2
    paddd %3, %1
    pxor %1, %1
%endif
%endmacro

%macro BACKUP_POINTERS 0
%if ARCH_X86_64
%if WIN64
    PUSH r7
%endif
    mov r6, r0
    mov r7, r2
%endif
%endmacro

%macro RESTORE_AND_INC_POINTERS 0
%if ARCH_X86_64
    lea r0, [r6+8*SIZEOF_PIXEL]
    lea r2, [r7+8*SIZEOF_PIXEL]
%if WIN64
    POP r7
%endif
%else
    mov r0, r0mp
    mov r2, r2mp
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
%endif
%endmacro
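; SATD_4x8_SSE handles a 4x8 block with the 8x4 kernel by packing two
; 4-pixel rows into each register: the HIGH_BIT_DEPTH path loads word rows
; with movh/movhps and subtracts directly, while the 8-bit path pairs rows
; with JDUP and lets pmaddubsw (against hmul_4p) form the +/- sums as part
; of the load. %3 selects the accumulation into m7 ('swap' starts a new sum,
; anything else adds); the 4-argument form goes through SATD_8x4_1_SSE,
; which always accumulates dwords.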
%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
    movh m0, [r0+0*r1]
    movh m4, [r2+0*r3]
    movh m1, [r0+1*r1]
    movh m5, [r2+1*r3]
    movhps m0, [r0+4*r1]
    movhps m4, [r2+4*r3]
    movh m2, [r0+2*r1]
    movh m6, [r2+2*r3]
    psubw m0, m4
    movh m3, [r0+r4]
    movh m4, [r2+r5]
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    movhps m1, [r0+1*r1]
    movhps m5, [r2+1*r3]
    movhps m2, [r0+2*r1]
    movhps m6, [r2+2*r3]
    psubw m1, m5
    movhps m3, [r0+r4]
    movhps m4, [r2+r5]
    psubw m2, m6
    psubw m3, m4
%else ; !HIGH_BIT_DEPTH
    movd m4, [r2]
    movd m5, [r2+r3]
    movd m6, [r2+2*r3]
    add r2, r5
    movd m0, [r0]
    movd m1, [r0+r1]
    movd m2, [r0+2*r1]
    add r0, r4
    movd m3, [r2+r3]
    JDUP m4, m3
    movd m3, [r0+r1]
    JDUP m0, m3
    movd m3, [r2+2*r3]
    JDUP m5, m3
    movd m3, [r0+2*r1]
    JDUP m1, m3
%if %1==0 && %2==1
    mova m3, [hmul_4p]
    DIFFOP 0, 4, 1, 5, 3
%else
    DIFFOP 0, 4, 1, 5, 7
%endif
    movd m5, [r2]
    add r2, r5
    movd m3, [r0]
    add r0, r4
    movd m4, [r2]
    JDUP m6, m4
    movd m4, [r0]
    JDUP m2, m4
    movd m4, [r2+r3]
    JDUP m5, m4
    movd m4, [r0+r1]
    JDUP m3, m4
%if %1==0 && %2==1
    mova m4, [hmul_4p]
    DIFFOP 2, 6, 3, 5, 4
%else
    DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
%if %0 == 4
    SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
%else
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
cglobal pixel_satd_4x4, 4, 6, 6
    SATD_START_MMX
    mova m4, [hmul_4p]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
    HADDW m0, m1
    movd eax, m0
    RET
%endif

cglobal pixel_satd_4x8, 4, 6, 8
    SATD_START_MMX
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
%if BIT_DEPTH == 12
    HADDD m7, m1
%else
    HADDUW m7, m1
%endif
    movd eax, m7
    RET

cglobal pixel_satd_4x16, 4, 6, 8
    SATD_START_MMX
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0+r1*2*SIZEOF_PIXEL]
    lea r2, [r2+r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%if BIT_DEPTH == 12
    HADDD m7, m1
%else
    HADDUW m7, m1
%endif
    movd eax, m7
    RET

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
    ret

cglobal pixel_satd_8x8_internal2
%if WIN64
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%%pixel_satd_8x4_internal2:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%else
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%%pixel_satd_8x4_internal2:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%endif
    ret
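; The *_internal bodies each cover one 8x4 strip per LOAD_SUMSUB_8x4P /
; SATD_8x4 pair. pixel_satd_8x8_internal keeps a word accumulator in m6;
; pixel_satd_8x8_internal2 (and pixel_satd_16x4_internal2 below) widen the
; running SATD to dwords (m6 / m10), so callers can chain one 'call' per
; strip for large blocks without risking word overflow, re-point r0/r2 at
; each 8- or 16-pixel column, and finally reduce with HADDD into eax.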
; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)

cglobal pixel_satd_16x4_internal2
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    lea r2, [r2+4*r3]
    lea r0, [r0+4*r1]
    SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
    SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
    ret

cglobal pixel_satd_16x4, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_16x8, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x12, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x32, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x64, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x16, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
%%pixel_satd_16x8_internal:
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x8, 4,8,14    ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x24, 4,8,14    ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_48x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_64x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7 934 mov r6, r0 935 mov r7, r2 936%if vertical 937 mova m7, [pw_00ff] 938%endif 939 call pixel_satd_16x4_internal2 940 call pixel_satd_16x4_internal2 941 call pixel_satd_16x4_internal2 942 call pixel_satd_16x4_internal2 943 lea r0, [r6 + 16] 944 lea r2, [r7 + 16] 945 call pixel_satd_16x4_internal2 946 call pixel_satd_16x4_internal2 947 call pixel_satd_16x4_internal2 948 call pixel_satd_16x4_internal2 949 lea r0, [r6 + 32] 950 lea r2, [r7 + 32] 951 call pixel_satd_16x4_internal2 952 call pixel_satd_16x4_internal2 953 call pixel_satd_16x4_internal2 954 call pixel_satd_16x4_internal2 955 lea r0, [r6 + 48] 956 lea r2, [r7 + 48] 957 call pixel_satd_16x4_internal2 958 call pixel_satd_16x4_internal2 959 call pixel_satd_16x4_internal2 960 call pixel_satd_16x4_internal2 961 HADDD m10, m0 962 movd eax, m10 963 RET 964 965cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx) 966 SATD_START_SSE2 m10, m7 967 mov r6, r0 968 mov r7, r2 969%if vertical 970 mova m7, [pw_00ff] 971%endif 972 call pixel_satd_16x4_internal2 973 call pixel_satd_16x4_internal2 974 call pixel_satd_16x4_internal2 975 call pixel_satd_16x4_internal2 976 call pixel_satd_16x4_internal2 977 call pixel_satd_16x4_internal2 978 call pixel_satd_16x4_internal2 979 call pixel_satd_16x4_internal2 980 lea r0, [r6 + 16] 981 lea r2, [r7 + 16] 982 call pixel_satd_16x4_internal2 983 call pixel_satd_16x4_internal2 984 call pixel_satd_16x4_internal2 985 call pixel_satd_16x4_internal2 986 call pixel_satd_16x4_internal2 987 call pixel_satd_16x4_internal2 988 call pixel_satd_16x4_internal2 989 call pixel_satd_16x4_internal2 990 lea r0, [r6 + 32] 991 lea r2, [r7 + 32] 992 call pixel_satd_16x4_internal2 993 call pixel_satd_16x4_internal2 994 call pixel_satd_16x4_internal2 995 call pixel_satd_16x4_internal2 996 call pixel_satd_16x4_internal2 997 call pixel_satd_16x4_internal2 998 call pixel_satd_16x4_internal2 999 call pixel_satd_16x4_internal2 1000 lea r0, [r6 + 48] 1001 lea r2, [r7 + 48] 1002 call pixel_satd_16x4_internal2 1003 call pixel_satd_16x4_internal2 1004 call pixel_satd_16x4_internal2 1005 call pixel_satd_16x4_internal2 1006 call pixel_satd_16x4_internal2 1007 call pixel_satd_16x4_internal2 1008 call pixel_satd_16x4_internal2 1009 call pixel_satd_16x4_internal2 1010 1011 HADDD m10, m0 1012 movd eax, m10 1013 RET 1014 1015cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx) 1016 SATD_START_SSE2 m10, m7 1017 mov r6, r0 1018 mov r7, r2 1019%if vertical 1020 mova m7, [pw_00ff] 1021%endif 1022 call pixel_satd_16x4_internal2 1023 call pixel_satd_16x4_internal2 1024 call pixel_satd_16x4_internal2 1025 call pixel_satd_16x4_internal2 1026 call pixel_satd_16x4_internal2 1027 call pixel_satd_16x4_internal2 1028 call pixel_satd_16x4_internal2 1029 call pixel_satd_16x4_internal2 1030 call pixel_satd_16x4_internal2 1031 call pixel_satd_16x4_internal2 1032 call pixel_satd_16x4_internal2 1033 call pixel_satd_16x4_internal2 1034 lea r0, [r6 + 16] 1035 lea r2, [r7 + 16] 1036 call pixel_satd_16x4_internal2 1037 call pixel_satd_16x4_internal2 1038 call pixel_satd_16x4_internal2 1039 call pixel_satd_16x4_internal2 1040 call pixel_satd_16x4_internal2 1041 call pixel_satd_16x4_internal2 1042 call pixel_satd_16x4_internal2 1043 call pixel_satd_16x4_internal2 1044 call pixel_satd_16x4_internal2 1045 call pixel_satd_16x4_internal2 1046 call pixel_satd_16x4_internal2 1047 call pixel_satd_16x4_internal2 1048 lea r0, [r6 + 32] 1049 lea r2, [r7 + 32] 1050 call pixel_satd_16x4_internal2 1051 call pixel_satd_16x4_internal2 1052 call 
pixel_satd_16x4_internal2 1053 call pixel_satd_16x4_internal2 1054 call pixel_satd_16x4_internal2 1055 call pixel_satd_16x4_internal2 1056 call pixel_satd_16x4_internal2 1057 call pixel_satd_16x4_internal2 1058 call pixel_satd_16x4_internal2 1059 call pixel_satd_16x4_internal2 1060 call pixel_satd_16x4_internal2 1061 call pixel_satd_16x4_internal2 1062 lea r0, [r6 + 48] 1063 lea r2, [r7 + 48] 1064 call pixel_satd_16x4_internal2 1065 call pixel_satd_16x4_internal2 1066 call pixel_satd_16x4_internal2 1067 call pixel_satd_16x4_internal2 1068 call pixel_satd_16x4_internal2 1069 call pixel_satd_16x4_internal2 1070 call pixel_satd_16x4_internal2 1071 call pixel_satd_16x4_internal2 1072 call pixel_satd_16x4_internal2 1073 call pixel_satd_16x4_internal2 1074 call pixel_satd_16x4_internal2 1075 call pixel_satd_16x4_internal2 1076 1077 HADDD m10, m0 1078 movd eax, m10 1079 RET 1080 1081cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx) 1082 SATD_START_SSE2 m10, m7 1083 mov r6, r0 1084 mov r7, r2 1085%if vertical 1086 mova m7, [pw_00ff] 1087%endif 1088 call pixel_satd_16x4_internal2 1089 call pixel_satd_16x4_internal2 1090 call pixel_satd_16x4_internal2 1091 call pixel_satd_16x4_internal2 1092 call pixel_satd_16x4_internal2 1093 call pixel_satd_16x4_internal2 1094 call pixel_satd_16x4_internal2 1095 call pixel_satd_16x4_internal2 1096 call pixel_satd_16x4_internal2 1097 call pixel_satd_16x4_internal2 1098 call pixel_satd_16x4_internal2 1099 call pixel_satd_16x4_internal2 1100 call pixel_satd_16x4_internal2 1101 call pixel_satd_16x4_internal2 1102 call pixel_satd_16x4_internal2 1103 call pixel_satd_16x4_internal2 1104 lea r0, [r6 + 16] 1105 lea r2, [r7 + 16] 1106 call pixel_satd_16x4_internal2 1107 call pixel_satd_16x4_internal2 1108 call pixel_satd_16x4_internal2 1109 call pixel_satd_16x4_internal2 1110 call pixel_satd_16x4_internal2 1111 call pixel_satd_16x4_internal2 1112 call pixel_satd_16x4_internal2 1113 call pixel_satd_16x4_internal2 1114 call pixel_satd_16x4_internal2 1115 call pixel_satd_16x4_internal2 1116 call pixel_satd_16x4_internal2 1117 call pixel_satd_16x4_internal2 1118 call pixel_satd_16x4_internal2 1119 call pixel_satd_16x4_internal2 1120 call pixel_satd_16x4_internal2 1121 call pixel_satd_16x4_internal2 1122 lea r0, [r6 + 32] 1123 lea r2, [r7 + 32] 1124 call pixel_satd_16x4_internal2 1125 call pixel_satd_16x4_internal2 1126 call pixel_satd_16x4_internal2 1127 call pixel_satd_16x4_internal2 1128 call pixel_satd_16x4_internal2 1129 call pixel_satd_16x4_internal2 1130 call pixel_satd_16x4_internal2 1131 call pixel_satd_16x4_internal2 1132 call pixel_satd_16x4_internal2 1133 call pixel_satd_16x4_internal2 1134 call pixel_satd_16x4_internal2 1135 call pixel_satd_16x4_internal2 1136 call pixel_satd_16x4_internal2 1137 call pixel_satd_16x4_internal2 1138 call pixel_satd_16x4_internal2 1139 call pixel_satd_16x4_internal2 1140 lea r0, [r6 + 48] 1141 lea r2, [r7 + 48] 1142 call pixel_satd_16x4_internal2 1143 call pixel_satd_16x4_internal2 1144 call pixel_satd_16x4_internal2 1145 call pixel_satd_16x4_internal2 1146 call pixel_satd_16x4_internal2 1147 call pixel_satd_16x4_internal2 1148 call pixel_satd_16x4_internal2 1149 call pixel_satd_16x4_internal2 1150 call pixel_satd_16x4_internal2 1151 call pixel_satd_16x4_internal2 1152 call pixel_satd_16x4_internal2 1153 call pixel_satd_16x4_internal2 1154 call pixel_satd_16x4_internal2 1155 call pixel_satd_16x4_internal2 1156 call pixel_satd_16x4_internal2 1157 call pixel_satd_16x4_internal2 1158 1159 HADDD m10, m0 1160 movd eax, m10 1161 
RET 1162 1163%else 1164%if WIN64 1165cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx) 1166 SATD_START_SSE2 m6, m7 1167 mov r6, r0 1168 mov r7, r2 1169 call pixel_satd_8x8_internal2 1170 call pixel_satd_8x8_internal2 1171 call pixel_satd_8x8_internal2 1172 lea r0, [r6 + 8*SIZEOF_PIXEL] 1173 lea r2, [r7 + 8*SIZEOF_PIXEL] 1174 call pixel_satd_8x8_internal2 1175 call pixel_satd_8x8_internal2 1176 call pixel_satd_8x8_internal2 1177 HADDD m6, m0 1178 movd eax, m6 1179 RET 1180%else 1181cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64 1182 SATD_START_SSE2 m6, m7 1183 mov r6, r0 1184 mov [rsp], r2 1185 call pixel_satd_8x8_internal2 1186 call pixel_satd_8x8_internal2 1187 call pixel_satd_8x8_internal2 1188 lea r0, [r6 + 8*SIZEOF_PIXEL] 1189 mov r2, [rsp] 1190 add r2, 8*SIZEOF_PIXEL 1191 call pixel_satd_8x8_internal2 1192 call pixel_satd_8x8_internal2 1193 call pixel_satd_8x8_internal2 1194 HADDD m6, m0 1195 movd eax, m6 1196 RET 1197%endif 1198%if WIN64 1199cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx) 1200 SATD_START_SSE2 m6, m7 1201 mov r6, r0 1202 mov r7, r2 1203 call pixel_satd_8x8_internal2 1204 call pixel_satd_8x8_internal2 1205 call pixel_satd_8x8_internal2 1206 call pixel_satd_8x8_internal2 1207 call pixel_satd_8x8_internal2 1208 call pixel_satd_8x8_internal2 1209 lea r0, [r6 + 8*SIZEOF_PIXEL] 1210 lea r2, [r7 + 8*SIZEOF_PIXEL] 1211 call pixel_satd_8x8_internal2 1212 call pixel_satd_8x8_internal2 1213 call pixel_satd_8x8_internal2 1214 call pixel_satd_8x8_internal2 1215 call pixel_satd_8x8_internal2 1216 call pixel_satd_8x8_internal2 1217 lea r0, [r6 + 16*SIZEOF_PIXEL] 1218 lea r2, [r7 + 16*SIZEOF_PIXEL] 1219 call pixel_satd_8x8_internal2 1220 call pixel_satd_8x8_internal2 1221 call pixel_satd_8x8_internal2 1222 call pixel_satd_8x8_internal2 1223 call pixel_satd_8x8_internal2 1224 call pixel_satd_8x8_internal2 1225 lea r0, [r6 + 24*SIZEOF_PIXEL] 1226 lea r2, [r7 + 24*SIZEOF_PIXEL] 1227 call pixel_satd_8x8_internal2 1228 call pixel_satd_8x8_internal2 1229 call pixel_satd_8x8_internal2 1230 call pixel_satd_8x8_internal2 1231 call pixel_satd_8x8_internal2 1232 call pixel_satd_8x8_internal2 1233 HADDD m6, m0 1234 movd eax, m6 1235 RET 1236%else 1237cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64 1238 SATD_START_SSE2 m6, m7 1239 mov r6, r0 1240 mov [rsp], r2 1241 call pixel_satd_8x8_internal2 1242 call pixel_satd_8x8_internal2 1243 call pixel_satd_8x8_internal2 1244 call pixel_satd_8x8_internal2 1245 call pixel_satd_8x8_internal2 1246 call pixel_satd_8x8_internal2 1247 lea r0, [r6 + 8*SIZEOF_PIXEL] 1248 mov r2, [rsp] 1249 add r2, 8*SIZEOF_PIXEL 1250 call pixel_satd_8x8_internal2 1251 call pixel_satd_8x8_internal2 1252 call pixel_satd_8x8_internal2 1253 call pixel_satd_8x8_internal2 1254 call pixel_satd_8x8_internal2 1255 call pixel_satd_8x8_internal2 1256 lea r0, [r6 + 16*SIZEOF_PIXEL] 1257 mov r2, [rsp] 1258 add r2, 16*SIZEOF_PIXEL 1259 call pixel_satd_8x8_internal2 1260 call pixel_satd_8x8_internal2 1261 call pixel_satd_8x8_internal2 1262 call pixel_satd_8x8_internal2 1263 call pixel_satd_8x8_internal2 1264 call pixel_satd_8x8_internal2 1265 lea r0, [r6 + 24*SIZEOF_PIXEL] 1266 mov r2, [rsp] 1267 add r2, 24*SIZEOF_PIXEL 1268 call pixel_satd_8x8_internal2 1269 call pixel_satd_8x8_internal2 1270 call pixel_satd_8x8_internal2 1271 call pixel_satd_8x8_internal2 1272 call pixel_satd_8x8_internal2 1273 call pixel_satd_8x8_internal2 1274 HADDD m6, m0 1275 movd eax, m6 1276 RET 1277%endif 1278 1279%if WIN64 1280cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx) 
1281 SATD_START_SSE2 m6, m7 1282 mov r6, r0 1283 mov r7, r2 1284 call pixel_satd_8x8_internal2 1285 call pixel_satd_8x8_internal2 1286 call pixel_satd_8x8_internal2 1287 call pixel_satd_8x8_internal2 1288 call pixel_satd_8x8_internal2 1289 call pixel_satd_8x8_internal2 1290 call pixel_satd_8x8_internal2 1291 call pixel_satd_8x8_internal2 1292 lea r0, [r6 + 8*SIZEOF_PIXEL] 1293 lea r2, [r7 + 8*SIZEOF_PIXEL] 1294 call pixel_satd_8x8_internal2 1295 call pixel_satd_8x8_internal2 1296 call pixel_satd_8x8_internal2 1297 call pixel_satd_8x8_internal2 1298 call pixel_satd_8x8_internal2 1299 call pixel_satd_8x8_internal2 1300 call pixel_satd_8x8_internal2 1301 call pixel_satd_8x8_internal2 1302 lea r0, [r6 + 16*SIZEOF_PIXEL] 1303 lea r2, [r7 + 16*SIZEOF_PIXEL] 1304 call pixel_satd_8x8_internal2 1305 call pixel_satd_8x8_internal2 1306 call pixel_satd_8x8_internal2 1307 call pixel_satd_8x8_internal2 1308 call pixel_satd_8x8_internal2 1309 call pixel_satd_8x8_internal2 1310 call pixel_satd_8x8_internal2 1311 call pixel_satd_8x8_internal2 1312 HADDD m6, m0 1313 movd eax, m6 1314 RET 1315%else 1316cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64 1317 SATD_START_SSE2 m6, m7 1318 mov r6, r0 1319 mov [rsp], r2 1320 call pixel_satd_8x8_internal2 1321 call pixel_satd_8x8_internal2 1322 call pixel_satd_8x8_internal2 1323 call pixel_satd_8x8_internal2 1324 call pixel_satd_8x8_internal2 1325 call pixel_satd_8x8_internal2 1326 call pixel_satd_8x8_internal2 1327 call pixel_satd_8x8_internal2 1328 lea r0, [r6 + 8*SIZEOF_PIXEL] 1329 mov r2, [rsp] 1330 add r2, 8*SIZEOF_PIXEL 1331 call pixel_satd_8x8_internal2 1332 call pixel_satd_8x8_internal2 1333 call pixel_satd_8x8_internal2 1334 call pixel_satd_8x8_internal2 1335 call pixel_satd_8x8_internal2 1336 call pixel_satd_8x8_internal2 1337 call pixel_satd_8x8_internal2 1338 call pixel_satd_8x8_internal2 1339 lea r0, [r6 + 16*SIZEOF_PIXEL] 1340 mov r2, [rsp] 1341 add r2, 16*SIZEOF_PIXEL 1342 call pixel_satd_8x8_internal2 1343 call pixel_satd_8x8_internal2 1344 call pixel_satd_8x8_internal2 1345 call pixel_satd_8x8_internal2 1346 call pixel_satd_8x8_internal2 1347 call pixel_satd_8x8_internal2 1348 call pixel_satd_8x8_internal2 1349 call pixel_satd_8x8_internal2 1350 HADDD m6, m0 1351 movd eax, m6 1352 RET 1353%endif 1354 1355%if WIN64 1356cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx) 1357 SATD_START_SSE2 m6, m7 1358 mov r6, r0 1359 mov r7, r2 1360 call pixel_satd_8x8_internal2 1361 call pixel_satd_8x8_internal2 1362 call pixel_satd_8x8_internal2 1363 call pixel_satd_8x8_internal2 1364 call pixel_satd_8x8_internal2 1365 call pixel_satd_8x8_internal2 1366 call pixel_satd_8x8_internal2 1367 call pixel_satd_8x8_internal2 1368 HADDD m6, m0 1369 movd eax, m6 1370 RET 1371%else 1372cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64 1373 SATD_START_SSE2 m6, m7 1374 mov r6, r0 1375 mov [rsp], r2 1376 call pixel_satd_8x8_internal2 1377 call pixel_satd_8x8_internal2 1378 call pixel_satd_8x8_internal2 1379 call pixel_satd_8x8_internal2 1380 call pixel_satd_8x8_internal2 1381 call pixel_satd_8x8_internal2 1382 call pixel_satd_8x8_internal2 1383 call pixel_satd_8x8_internal2 1384 HADDD m6, m0 1385 movd eax, m6 1386 RET 1387%endif 1388 1389%if WIN64 1390cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx) 1391 SATD_START_SSE2 m6, m7 1392 mov r6, r0 1393 mov r7, r2 1394 call pixel_satd_8x8_internal2 1395 call %%pixel_satd_8x4_internal2 1396 pxor m7, m7 1397 movhlps m7, m6 1398 paddd m6, m7 1399 pshufd m7, m6, 1 1400 paddd m6, m7 1401 movd eax, m6 1402 RET 1403%else 
1404cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64 1405 SATD_START_SSE2 m6, m7 1406 mov r6, r0 1407 mov [rsp], r2 1408 call pixel_satd_8x8_internal2 1409 call %%pixel_satd_8x4_internal2 1410 HADDD m6, m0 1411 movd eax, m6 1412 RET 1413%endif 1414 1415%if HIGH_BIT_DEPTH 1416%if WIN64 1417cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) 1418 SATD_START_MMX 1419 mov r6, r0 1420 mov r7, r2 1421 pxor m7, m7 1422 SATD_4x8_SSE vertical, 0, 4, 5 1423 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1424 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1425 SATD_4x8_SSE vertical, 1, 4, 5 1426 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1427 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1428 SATD_4x8_SSE vertical, 1, 4, 5 1429 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1430 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1431 SATD_4x8_SSE vertical, 1, 4, 5 1432 lea r0, [r6 + 4*SIZEOF_PIXEL] 1433 lea r2, [r7 + 4*SIZEOF_PIXEL] 1434 SATD_4x8_SSE vertical, 1, 4, 5 1435 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1436 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1437 SATD_4x8_SSE vertical, 1, 4, 5 1438 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1439 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1440 SATD_4x8_SSE vertical, 1, 4, 5 1441 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1442 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1443 SATD_4x8_SSE vertical, 1, 4, 5 1444 lea r0, [r6 + 8*SIZEOF_PIXEL] 1445 lea r2, [r7 + 8*SIZEOF_PIXEL] 1446 SATD_4x8_SSE vertical, 1, 4, 5 1447 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1448 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1449 SATD_4x8_SSE vertical, 1, 4, 5 1450 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1451 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1452 SATD_4x8_SSE vertical, 1, 4, 5 1453 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1454 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1455 SATD_4x8_SSE vertical, 1, 4, 5 1456 HADDD m7, m0 1457 movd eax, m7 1458 RET 1459%else 1460cglobal pixel_satd_12x32, 4,7,8,0-gprsize 1461 SATD_START_MMX 1462 mov r6, r0 1463 mov [rsp], r2 1464 pxor m7, m7 1465 SATD_4x8_SSE vertical, 0, 4, 5 1466 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1467 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1468 SATD_4x8_SSE vertical, 1, 4, 5 1469 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1470 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1471 SATD_4x8_SSE vertical, 1, 4, 5 1472 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1473 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1474 SATD_4x8_SSE vertical, 1, 4, 5 1475 lea r0, [r6 + 4*SIZEOF_PIXEL] 1476 mov r2, [rsp] 1477 add r2, 4*SIZEOF_PIXEL 1478 SATD_4x8_SSE vertical, 1, 4, 5 1479 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1480 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1481 SATD_4x8_SSE vertical, 1, 4, 5 1482 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1483 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1484 SATD_4x8_SSE vertical, 1, 4, 5 1485 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1486 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1487 SATD_4x8_SSE vertical, 1, 4, 5 1488 lea r0, [r6 + 8*SIZEOF_PIXEL] 1489 mov r2, [rsp] 1490 add r2, 8*SIZEOF_PIXEL 1491 SATD_4x8_SSE vertical, 1, 4, 5 1492 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1493 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1494 SATD_4x8_SSE vertical, 1, 4, 5 1495 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1496 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1497 SATD_4x8_SSE vertical, 1, 4, 5 1498 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1499 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1500 SATD_4x8_SSE vertical, 1, 4, 5 1501 HADDD m7, m0 1502 movd eax, m7 1503 RET 1504%endif 1505%else ;HIGH_BIT_DEPTH 1506%if WIN64 1507cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) 1508 SATD_START_MMX 1509 mov r6, r0 1510 mov r7, r2 1511%if vertical==0 1512 mova m7, [hmul_4p] 1513%endif 1514 SATD_4x8_SSE vertical, 0, swap 1515 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1516 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1517 SATD_4x8_SSE vertical, 1, add 1518 lea r0, 
[r0 + r1*2*SIZEOF_PIXEL] 1519 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1520 SATD_4x8_SSE vertical, 1, add 1521 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1522 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1523 SATD_4x8_SSE vertical, 1, add 1524 lea r0, [r6 + 4*SIZEOF_PIXEL] 1525 lea r2, [r7 + 4*SIZEOF_PIXEL] 1526 SATD_4x8_SSE vertical, 1, add 1527 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1528 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1529 SATD_4x8_SSE vertical, 1, add 1530 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1531 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1532 SATD_4x8_SSE vertical, 1, add 1533 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1534 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1535 SATD_4x8_SSE vertical, 1, add 1536 lea r0, [r6 + 8*SIZEOF_PIXEL] 1537 lea r2, [r7 + 8*SIZEOF_PIXEL] 1538 SATD_4x8_SSE vertical, 1, add 1539 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1540 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1541 SATD_4x8_SSE vertical, 1, add 1542 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1543 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1544 SATD_4x8_SSE vertical, 1, add 1545 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1546 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1547 SATD_4x8_SSE vertical, 1, add 1548 HADDW m7, m1 1549 movd eax, m7 1550 RET 1551%else 1552cglobal pixel_satd_12x32, 4,7,8,0-gprsize 1553 SATD_START_MMX 1554 mov r6, r0 1555 mov [rsp], r2 1556%if vertical==0 1557 mova m7, [hmul_4p] 1558%endif 1559 SATD_4x8_SSE vertical, 0, swap 1560 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1561 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1562 SATD_4x8_SSE vertical, 1, add 1563 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1564 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1565 SATD_4x8_SSE vertical, 1, add 1566 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1567 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1568 SATD_4x8_SSE vertical, 1, add 1569 lea r0, [r6 + 4*SIZEOF_PIXEL] 1570 mov r2, [rsp] 1571 add r2, 4*SIZEOF_PIXEL 1572 SATD_4x8_SSE vertical, 1, add 1573 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1574 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1575 SATD_4x8_SSE vertical, 1, add 1576 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1577 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1578 SATD_4x8_SSE vertical, 1, add 1579 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1580 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1581 SATD_4x8_SSE vertical, 1, add 1582 lea r0, [r6 + 8*SIZEOF_PIXEL] 1583 mov r2, [rsp] 1584 add r2, 8*SIZEOF_PIXEL 1585 SATD_4x8_SSE vertical, 1, add 1586 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1587 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1588 SATD_4x8_SSE vertical, 1, add 1589 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1590 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1591 SATD_4x8_SSE vertical, 1, add 1592 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1593 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1594 SATD_4x8_SSE vertical, 1, add 1595 HADDW m7, m1 1596 movd eax, m7 1597 RET 1598%endif 1599%endif 1600 1601%if HIGH_BIT_DEPTH 1602%if WIN64 1603cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) 1604 SATD_START_MMX 1605 mov r6, r0 1606 mov r7, r2 1607 pxor m7, m7 1608 SATD_4x8_SSE vertical, 0, 4, 5 1609 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1610 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1611 SATD_4x8_SSE vertical, 1, 4, 5 1612 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1613 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1614 SATD_4x8_SSE vertical, 1, 4, 5 1615 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1616 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1617 SATD_4x8_SSE vertical, 1, 4, 5 1618 HADDD m7, m0 1619 movd eax, m7 1620 RET 1621%else 1622cglobal pixel_satd_4x32, 4,7,8,0-gprsize 1623 SATD_START_MMX 1624 mov r6, r0 1625 mov [rsp], r2 1626 pxor m7, m7 1627 SATD_4x8_SSE vertical, 0, 4, 5 1628 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1629 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1630 SATD_4x8_SSE vertical, 1, 4, 5 1631 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1632 lea r2, [r2 + 
r3*2*SIZEOF_PIXEL] 1633 SATD_4x8_SSE vertical, 1, 4, 5 1634 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1635 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1636 SATD_4x8_SSE vertical, 1, 4, 5 1637 pxor m1, m1 1638 movhlps m1, m7 1639 paddd m7, m1 1640 pshufd m1, m7, 1 1641 paddd m7, m1 1642 movd eax, m7 1643 RET 1644%endif 1645%else 1646%if WIN64 1647cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) 1648 SATD_START_MMX 1649 mov r6, r0 1650 mov r7, r2 1651%if vertical==0 1652 mova m7, [hmul_4p] 1653%endif 1654 SATD_4x8_SSE vertical, 0, swap 1655 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1656 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1657 SATD_4x8_SSE vertical, 1, add 1658 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1659 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1660 SATD_4x8_SSE vertical, 1, add 1661 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1662 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1663 SATD_4x8_SSE vertical, 1, add 1664 HADDW m7, m1 1665 movd eax, m7 1666 RET 1667%else 1668cglobal pixel_satd_4x32, 4,7,8,0-gprsize 1669 SATD_START_MMX 1670 mov r6, r0 1671 mov [rsp], r2 1672%if vertical==0 1673 mova m7, [hmul_4p] 1674%endif 1675 SATD_4x8_SSE vertical, 0, swap 1676 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1677 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1678 SATD_4x8_SSE vertical, 1, add 1679 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1680 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1681 SATD_4x8_SSE vertical, 1, add 1682 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 1683 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 1684 SATD_4x8_SSE vertical, 1, add 1685 HADDW m7, m1 1686 movd eax, m7 1687 RET 1688%endif 1689%endif 1690 1691%if WIN64 1692cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx) 1693 SATD_START_SSE2 m6, m7 1694 mov r6, r0 1695 mov r7, r2 1696 call pixel_satd_8x8_internal2 1697 lea r0, [r6 + 8*SIZEOF_PIXEL] 1698 lea r2, [r7 + 8*SIZEOF_PIXEL] 1699 call pixel_satd_8x8_internal2 1700 lea r0, [r6 + 16*SIZEOF_PIXEL] 1701 lea r2, [r7 + 16*SIZEOF_PIXEL] 1702 call pixel_satd_8x8_internal2 1703 lea r0, [r6 + 24*SIZEOF_PIXEL] 1704 lea r2, [r7 + 24*SIZEOF_PIXEL] 1705 call pixel_satd_8x8_internal2 1706 HADDD m6, m0 1707 movd eax, m6 1708 RET 1709%else 1710cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64 1711 SATD_START_SSE2 m6, m7 1712 mov r6, r0 1713 mov [rsp], r2 1714 call pixel_satd_8x8_internal2 1715 lea r0, [r6 + 8*SIZEOF_PIXEL] 1716 mov r2, [rsp] 1717 add r2, 8*SIZEOF_PIXEL 1718 call pixel_satd_8x8_internal2 1719 lea r0, [r6 + 16*SIZEOF_PIXEL] 1720 mov r2, [rsp] 1721 add r2, 16*SIZEOF_PIXEL 1722 call pixel_satd_8x8_internal2 1723 lea r0, [r6 + 24*SIZEOF_PIXEL] 1724 mov r2, [rsp] 1725 add r2, 24*SIZEOF_PIXEL 1726 call pixel_satd_8x8_internal2 1727 HADDD m6, m0 1728 movd eax, m6 1729 RET 1730%endif 1731 1732%if WIN64 1733cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx) 1734 SATD_START_SSE2 m6, m7 1735 mov r6, r0 1736 mov r7, r2 1737 call pixel_satd_8x8_internal2 1738 call pixel_satd_8x8_internal2 1739 lea r0, [r6 + 8*SIZEOF_PIXEL] 1740 lea r2, [r7 + 8*SIZEOF_PIXEL] 1741 call pixel_satd_8x8_internal2 1742 call pixel_satd_8x8_internal2 1743 lea r0, [r6 + 16*SIZEOF_PIXEL] 1744 lea r2, [r7 + 16*SIZEOF_PIXEL] 1745 call pixel_satd_8x8_internal2 1746 call pixel_satd_8x8_internal2 1747 lea r0, [r6 + 24*SIZEOF_PIXEL] 1748 lea r2, [r7 + 24*SIZEOF_PIXEL] 1749 call pixel_satd_8x8_internal2 1750 call pixel_satd_8x8_internal2 1751 HADDD m6, m0 1752 movd eax, m6 1753 RET 1754%else 1755cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64 1756 SATD_START_SSE2 m6, m7 1757 mov r6, r0 1758 mov [rsp], r2 1759 call pixel_satd_8x8_internal2 1760 call pixel_satd_8x8_internal2 1761 lea r0, [r6 + 8*SIZEOF_PIXEL] 1762 mov r2, 
[rsp] 1763 add r2, 8*SIZEOF_PIXEL 1764 call pixel_satd_8x8_internal2 1765 call pixel_satd_8x8_internal2 1766 lea r0, [r6 + 16*SIZEOF_PIXEL] 1767 mov r2, [rsp] 1768 add r2, 16*SIZEOF_PIXEL 1769 call pixel_satd_8x8_internal2 1770 call pixel_satd_8x8_internal2 1771 lea r0, [r6 + 24*SIZEOF_PIXEL] 1772 mov r2, [rsp] 1773 add r2, 24*SIZEOF_PIXEL 1774 call pixel_satd_8x8_internal2 1775 call pixel_satd_8x8_internal2 1776 HADDD m6, m0 1777 movd eax, m6 1778 RET 1779%endif 1780 1781%if WIN64 1782cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx) 1783 SATD_START_SSE2 m6, m7 1784 mov r6, r0 1785 mov r7, r2 1786 call pixel_satd_8x8_internal2 1787 call pixel_satd_8x8_internal2 1788 call pixel_satd_8x8_internal2 1789 lea r0, [r6 + 8*SIZEOF_PIXEL] 1790 lea r2, [r7 + 8*SIZEOF_PIXEL] 1791 call pixel_satd_8x8_internal2 1792 call pixel_satd_8x8_internal2 1793 call pixel_satd_8x8_internal2 1794 lea r0, [r6 + 16*SIZEOF_PIXEL] 1795 lea r2, [r7 + 16*SIZEOF_PIXEL] 1796 call pixel_satd_8x8_internal2 1797 call pixel_satd_8x8_internal2 1798 call pixel_satd_8x8_internal2 1799 lea r0, [r6 + 24*SIZEOF_PIXEL] 1800 lea r2, [r7 + 24*SIZEOF_PIXEL] 1801 call pixel_satd_8x8_internal2 1802 call pixel_satd_8x8_internal2 1803 call pixel_satd_8x8_internal2 1804 HADDD m6, m0 1805 movd eax, m6 1806 RET 1807%else 1808cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64 1809 SATD_START_SSE2 m6, m7 1810 mov r6, r0 1811 mov [rsp], r2 1812 call pixel_satd_8x8_internal2 1813 call pixel_satd_8x8_internal2 1814 call pixel_satd_8x8_internal2 1815 lea r0, [r6 + 8*SIZEOF_PIXEL] 1816 mov r2, [rsp] 1817 add r2, 8*SIZEOF_PIXEL 1818 call pixel_satd_8x8_internal2 1819 call pixel_satd_8x8_internal2 1820 call pixel_satd_8x8_internal2 1821 lea r0, [r6 + 16*SIZEOF_PIXEL] 1822 mov r2, [rsp] 1823 add r2, 16*SIZEOF_PIXEL 1824 call pixel_satd_8x8_internal2 1825 call pixel_satd_8x8_internal2 1826 call pixel_satd_8x8_internal2 1827 lea r0, [r6 + 24*SIZEOF_PIXEL] 1828 mov r2, [rsp] 1829 add r2, 24*SIZEOF_PIXEL 1830 call pixel_satd_8x8_internal2 1831 call pixel_satd_8x8_internal2 1832 call pixel_satd_8x8_internal2 1833 HADDD m6, m0 1834 movd eax, m6 1835 RET 1836%endif 1837 1838%if WIN64 1839cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx) 1840 SATD_START_SSE2 m6, m7 1841 mov r6, r0 1842 mov r7, r2 1843 call pixel_satd_8x8_internal2 1844 call pixel_satd_8x8_internal2 1845 call pixel_satd_8x8_internal2 1846 call pixel_satd_8x8_internal2 1847 lea r0, [r6 + 8*SIZEOF_PIXEL] 1848 lea r2, [r7 + 8*SIZEOF_PIXEL] 1849 call pixel_satd_8x8_internal2 1850 call pixel_satd_8x8_internal2 1851 call pixel_satd_8x8_internal2 1852 call pixel_satd_8x8_internal2 1853 lea r0, [r6 + 16*SIZEOF_PIXEL] 1854 lea r2, [r7 + 16*SIZEOF_PIXEL] 1855 call pixel_satd_8x8_internal2 1856 call pixel_satd_8x8_internal2 1857 call pixel_satd_8x8_internal2 1858 call pixel_satd_8x8_internal2 1859 lea r0, [r6 + 24*SIZEOF_PIXEL] 1860 lea r2, [r7 + 24*SIZEOF_PIXEL] 1861 call pixel_satd_8x8_internal2 1862 call pixel_satd_8x8_internal2 1863 call pixel_satd_8x8_internal2 1864 call pixel_satd_8x8_internal2 1865 HADDD m6, m0 1866 movd eax, m6 1867 RET 1868%else 1869cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64 1870 SATD_START_SSE2 m6, m7 1871 mov r6, r0 1872 mov [rsp], r2 1873 call pixel_satd_8x8_internal2 1874 call pixel_satd_8x8_internal2 1875 call pixel_satd_8x8_internal2 1876 call pixel_satd_8x8_internal2 1877 lea r0, [r6 + 8*SIZEOF_PIXEL] 1878 mov r2, [rsp] 1879 add r2, 8*SIZEOF_PIXEL 1880 call pixel_satd_8x8_internal2 1881 call pixel_satd_8x8_internal2 1882 call 
pixel_satd_8x8_internal2 1883 call pixel_satd_8x8_internal2 1884 lea r0, [r6 + 16*SIZEOF_PIXEL] 1885 mov r2, [rsp] 1886 add r2, 16*SIZEOF_PIXEL 1887 call pixel_satd_8x8_internal2 1888 call pixel_satd_8x8_internal2 1889 call pixel_satd_8x8_internal2 1890 call pixel_satd_8x8_internal2 1891 lea r0, [r6 + 24*SIZEOF_PIXEL] 1892 mov r2, [rsp] 1893 add r2, 24*SIZEOF_PIXEL 1894 call pixel_satd_8x8_internal2 1895 call pixel_satd_8x8_internal2 1896 call pixel_satd_8x8_internal2 1897 call pixel_satd_8x8_internal2 1898 HADDD m6, m0 1899 movd eax, m6 1900 RET 1901%endif 1902 1903%if WIN64 1904cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx) 1905 SATD_START_SSE2 m6, m7 1906 mov r6, r0 1907 mov r7, r2 1908 call pixel_satd_8x8_internal2 1909 call pixel_satd_8x8_internal2 1910 call pixel_satd_8x8_internal2 1911 call pixel_satd_8x8_internal2 1912 call pixel_satd_8x8_internal2 1913 call pixel_satd_8x8_internal2 1914 call pixel_satd_8x8_internal2 1915 call pixel_satd_8x8_internal2 1916 lea r0, [r6 + 8*SIZEOF_PIXEL] 1917 lea r2, [r7 + 8*SIZEOF_PIXEL] 1918 call pixel_satd_8x8_internal2 1919 call pixel_satd_8x8_internal2 1920 call pixel_satd_8x8_internal2 1921 call pixel_satd_8x8_internal2 1922 call pixel_satd_8x8_internal2 1923 call pixel_satd_8x8_internal2 1924 call pixel_satd_8x8_internal2 1925 call pixel_satd_8x8_internal2 1926 lea r0, [r6 + 16*SIZEOF_PIXEL] 1927 lea r2, [r7 + 16*SIZEOF_PIXEL] 1928 call pixel_satd_8x8_internal2 1929 call pixel_satd_8x8_internal2 1930 call pixel_satd_8x8_internal2 1931 call pixel_satd_8x8_internal2 1932 call pixel_satd_8x8_internal2 1933 call pixel_satd_8x8_internal2 1934 call pixel_satd_8x8_internal2 1935 call pixel_satd_8x8_internal2 1936 lea r0, [r6 + 24*SIZEOF_PIXEL] 1937 lea r2, [r7 + 24*SIZEOF_PIXEL] 1938 call pixel_satd_8x8_internal2 1939 call pixel_satd_8x8_internal2 1940 call pixel_satd_8x8_internal2 1941 call pixel_satd_8x8_internal2 1942 call pixel_satd_8x8_internal2 1943 call pixel_satd_8x8_internal2 1944 call pixel_satd_8x8_internal2 1945 call pixel_satd_8x8_internal2 1946 HADDD m6, m0 1947 movd eax, m6 1948 RET 1949%else 1950cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64 1951 SATD_START_SSE2 m6, m7 1952 mov r6, r0 1953 mov [rsp], r2 1954 call pixel_satd_8x8_internal2 1955 call pixel_satd_8x8_internal2 1956 call pixel_satd_8x8_internal2 1957 call pixel_satd_8x8_internal2 1958 call pixel_satd_8x8_internal2 1959 call pixel_satd_8x8_internal2 1960 call pixel_satd_8x8_internal2 1961 call pixel_satd_8x8_internal2 1962 lea r0, [r6 + 8*SIZEOF_PIXEL] 1963 mov r2, [rsp] 1964 add r2, 8*SIZEOF_PIXEL 1965 call pixel_satd_8x8_internal2 1966 call pixel_satd_8x8_internal2 1967 call pixel_satd_8x8_internal2 1968 call pixel_satd_8x8_internal2 1969 call pixel_satd_8x8_internal2 1970 call pixel_satd_8x8_internal2 1971 call pixel_satd_8x8_internal2 1972 call pixel_satd_8x8_internal2 1973 lea r0, [r6 + 16*SIZEOF_PIXEL] 1974 mov r2, [rsp] 1975 add r2, 16*SIZEOF_PIXEL 1976 call pixel_satd_8x8_internal2 1977 call pixel_satd_8x8_internal2 1978 call pixel_satd_8x8_internal2 1979 call pixel_satd_8x8_internal2 1980 call pixel_satd_8x8_internal2 1981 call pixel_satd_8x8_internal2 1982 call pixel_satd_8x8_internal2 1983 call pixel_satd_8x8_internal2 1984 lea r0, [r6 + 24*SIZEOF_PIXEL] 1985 mov r2, [rsp] 1986 add r2, 24*SIZEOF_PIXEL 1987 call pixel_satd_8x8_internal2 1988 call pixel_satd_8x8_internal2 1989 call pixel_satd_8x8_internal2 1990 call pixel_satd_8x8_internal2 1991 call pixel_satd_8x8_internal2 1992 call pixel_satd_8x8_internal2 1993 call pixel_satd_8x8_internal2 1994 
call pixel_satd_8x8_internal2 1995 HADDD m6, m0 1996 movd eax, m6 1997 RET 1998%endif 1999 2000%if WIN64 2001cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx) 2002 SATD_START_SSE2 m6, m7 2003 mov r6, r0 2004 mov r7, r2 2005 call pixel_satd_8x8_internal2 2006 call pixel_satd_8x8_internal2 2007 call pixel_satd_8x8_internal2 2008 call pixel_satd_8x8_internal2 2009 call pixel_satd_8x8_internal2 2010 call pixel_satd_8x8_internal2 2011 call pixel_satd_8x8_internal2 2012 call pixel_satd_8x8_internal2 2013 lea r0, [r6 + 8*SIZEOF_PIXEL] 2014 lea r2, [r7 + 8*SIZEOF_PIXEL] 2015 call pixel_satd_8x8_internal2 2016 call pixel_satd_8x8_internal2 2017 call pixel_satd_8x8_internal2 2018 call pixel_satd_8x8_internal2 2019 call pixel_satd_8x8_internal2 2020 call pixel_satd_8x8_internal2 2021 call pixel_satd_8x8_internal2 2022 call pixel_satd_8x8_internal2 2023 lea r0, [r6 + 16*SIZEOF_PIXEL] 2024 lea r2, [r7 + 16*SIZEOF_PIXEL] 2025 call pixel_satd_8x8_internal2 2026 call pixel_satd_8x8_internal2 2027 call pixel_satd_8x8_internal2 2028 call pixel_satd_8x8_internal2 2029 call pixel_satd_8x8_internal2 2030 call pixel_satd_8x8_internal2 2031 call pixel_satd_8x8_internal2 2032 call pixel_satd_8x8_internal2 2033 lea r0, [r6 + 24*SIZEOF_PIXEL] 2034 lea r2, [r7 + 24*SIZEOF_PIXEL] 2035 call pixel_satd_8x8_internal2 2036 call pixel_satd_8x8_internal2 2037 call pixel_satd_8x8_internal2 2038 call pixel_satd_8x8_internal2 2039 call pixel_satd_8x8_internal2 2040 call pixel_satd_8x8_internal2 2041 call pixel_satd_8x8_internal2 2042 call pixel_satd_8x8_internal2 2043 lea r0, [r6 + 32*SIZEOF_PIXEL] 2044 lea r2, [r7 + 32*SIZEOF_PIXEL] 2045 call pixel_satd_8x8_internal2 2046 call pixel_satd_8x8_internal2 2047 call pixel_satd_8x8_internal2 2048 call pixel_satd_8x8_internal2 2049 call pixel_satd_8x8_internal2 2050 call pixel_satd_8x8_internal2 2051 call pixel_satd_8x8_internal2 2052 call pixel_satd_8x8_internal2 2053 lea r0, [r6 + 40*SIZEOF_PIXEL] 2054 lea r2, [r7 + 40*SIZEOF_PIXEL] 2055 call pixel_satd_8x8_internal2 2056 call pixel_satd_8x8_internal2 2057 call pixel_satd_8x8_internal2 2058 call pixel_satd_8x8_internal2 2059 call pixel_satd_8x8_internal2 2060 call pixel_satd_8x8_internal2 2061 call pixel_satd_8x8_internal2 2062 call pixel_satd_8x8_internal2 2063 HADDD m6, m0 2064 movd eax, m6 2065 RET 2066%else 2067cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64 2068 SATD_START_SSE2 m6, m7 2069 mov r6, r0 2070 mov [rsp], r2 2071 call pixel_satd_8x8_internal2 2072 call pixel_satd_8x8_internal2 2073 call pixel_satd_8x8_internal2 2074 call pixel_satd_8x8_internal2 2075 call pixel_satd_8x8_internal2 2076 call pixel_satd_8x8_internal2 2077 call pixel_satd_8x8_internal2 2078 call pixel_satd_8x8_internal2 2079 lea r0, [r6 + 8*SIZEOF_PIXEL] 2080 mov r2, [rsp] 2081 add r2,8*SIZEOF_PIXEL 2082 call pixel_satd_8x8_internal2 2083 call pixel_satd_8x8_internal2 2084 call pixel_satd_8x8_internal2 2085 call pixel_satd_8x8_internal2 2086 call pixel_satd_8x8_internal2 2087 call pixel_satd_8x8_internal2 2088 call pixel_satd_8x8_internal2 2089 call pixel_satd_8x8_internal2 2090 lea r0, [r6 + 16*SIZEOF_PIXEL] 2091 mov r2, [rsp] 2092 add r2,16*SIZEOF_PIXEL 2093 call pixel_satd_8x8_internal2 2094 call pixel_satd_8x8_internal2 2095 call pixel_satd_8x8_internal2 2096 call pixel_satd_8x8_internal2 2097 call pixel_satd_8x8_internal2 2098 call pixel_satd_8x8_internal2 2099 call pixel_satd_8x8_internal2 2100 call pixel_satd_8x8_internal2 2101 lea r0, [r6 + 24*SIZEOF_PIXEL] 2102 mov r2, [rsp] 2103 add r2,24*SIZEOF_PIXEL 2104 call 
pixel_satd_8x8_internal2 2105 call pixel_satd_8x8_internal2 2106 call pixel_satd_8x8_internal2 2107 call pixel_satd_8x8_internal2 2108 call pixel_satd_8x8_internal2 2109 call pixel_satd_8x8_internal2 2110 call pixel_satd_8x8_internal2 2111 call pixel_satd_8x8_internal2 2112 lea r0, [r6 + 32*SIZEOF_PIXEL] 2113 mov r2, [rsp] 2114 add r2,32*SIZEOF_PIXEL 2115 call pixel_satd_8x8_internal2 2116 call pixel_satd_8x8_internal2 2117 call pixel_satd_8x8_internal2 2118 call pixel_satd_8x8_internal2 2119 call pixel_satd_8x8_internal2 2120 call pixel_satd_8x8_internal2 2121 call pixel_satd_8x8_internal2 2122 call pixel_satd_8x8_internal2 2123 lea r0, [r6 + 40*SIZEOF_PIXEL] 2124 mov r2, [rsp] 2125 add r2,40*SIZEOF_PIXEL 2126 call pixel_satd_8x8_internal2 2127 call pixel_satd_8x8_internal2 2128 call pixel_satd_8x8_internal2 2129 call pixel_satd_8x8_internal2 2130 call pixel_satd_8x8_internal2 2131 call pixel_satd_8x8_internal2 2132 call pixel_satd_8x8_internal2 2133 call pixel_satd_8x8_internal2 2134 HADDD m6, m0 2135 movd eax, m6 2136 RET 2137%endif 2138 2139 2140%if WIN64 2141cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx) 2142 SATD_START_SSE2 m6, m7 2143 mov r6, r0 2144 mov r7, r2 2145 call pixel_satd_8x8_internal2 2146 call pixel_satd_8x8_internal2 2147 lea r0, [r6 + 8*SIZEOF_PIXEL] 2148 lea r2, [r7 + 8*SIZEOF_PIXEL] 2149 call pixel_satd_8x8_internal2 2150 call pixel_satd_8x8_internal2 2151 lea r0, [r6 + 16*SIZEOF_PIXEL] 2152 lea r2, [r7 + 16*SIZEOF_PIXEL] 2153 call pixel_satd_8x8_internal2 2154 call pixel_satd_8x8_internal2 2155 lea r0, [r6 + 24*SIZEOF_PIXEL] 2156 lea r2, [r7 + 24*SIZEOF_PIXEL] 2157 call pixel_satd_8x8_internal2 2158 call pixel_satd_8x8_internal2 2159 lea r0, [r6 + 32*SIZEOF_PIXEL] 2160 lea r2, [r7 + 32*SIZEOF_PIXEL] 2161 call pixel_satd_8x8_internal2 2162 call pixel_satd_8x8_internal2 2163 lea r0, [r6 + 40*SIZEOF_PIXEL] 2164 lea r2, [r7 + 40*SIZEOF_PIXEL] 2165 call pixel_satd_8x8_internal2 2166 call pixel_satd_8x8_internal2 2167 lea r0, [r6 + 48*SIZEOF_PIXEL] 2168 lea r2, [r7 + 48*SIZEOF_PIXEL] 2169 call pixel_satd_8x8_internal2 2170 call pixel_satd_8x8_internal2 2171 lea r0, [r6 + 56*SIZEOF_PIXEL] 2172 lea r2, [r7 + 56*SIZEOF_PIXEL] 2173 call pixel_satd_8x8_internal2 2174 call pixel_satd_8x8_internal2 2175 HADDD m6, m0 2176 movd eax, m6 2177 RET 2178%else 2179cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64 2180 SATD_START_SSE2 m6, m7 2181 mov r6, r0 2182 mov [rsp], r2 2183 call pixel_satd_8x8_internal2 2184 call pixel_satd_8x8_internal2 2185 lea r0, [r6 + 8*SIZEOF_PIXEL] 2186 mov r2, [rsp] 2187 add r2,8*SIZEOF_PIXEL 2188 call pixel_satd_8x8_internal2 2189 call pixel_satd_8x8_internal2 2190 lea r0, [r6 + 16*SIZEOF_PIXEL] 2191 mov r2, [rsp] 2192 add r2,16*SIZEOF_PIXEL 2193 call pixel_satd_8x8_internal2 2194 call pixel_satd_8x8_internal2 2195 lea r0, [r6 + 24*SIZEOF_PIXEL] 2196 mov r2, [rsp] 2197 add r2,24*SIZEOF_PIXEL 2198 call pixel_satd_8x8_internal2 2199 call pixel_satd_8x8_internal2 2200 lea r0, [r6 + 32*SIZEOF_PIXEL] 2201 mov r2, [rsp] 2202 add r2,32*SIZEOF_PIXEL 2203 call pixel_satd_8x8_internal2 2204 call pixel_satd_8x8_internal2 2205 lea r0, [r6 + 40*SIZEOF_PIXEL] 2206 mov r2, [rsp] 2207 add r2,40*SIZEOF_PIXEL 2208 call pixel_satd_8x8_internal2 2209 call pixel_satd_8x8_internal2 2210 lea r0, [r6 + 48*SIZEOF_PIXEL] 2211 mov r2, [rsp] 2212 add r2,48*SIZEOF_PIXEL 2213 call pixel_satd_8x8_internal2 2214 call pixel_satd_8x8_internal2 2215 lea r0, [r6 + 56*SIZEOF_PIXEL] 2216 mov r2, [rsp] 2217 add r2,56*SIZEOF_PIXEL 2218 call pixel_satd_8x8_internal2 2219 call 
pixel_satd_8x8_internal2 2220 HADDD m6, m0 2221 movd eax, m6 2222 RET 2223%endif 2224 2225%if WIN64 2226cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx) 2227 SATD_START_SSE2 m6, m7 2228 mov r6, r0 2229 mov r7, r2 2230 call pixel_satd_8x8_internal2 2231 call pixel_satd_8x8_internal2 2232 call pixel_satd_8x8_internal2 2233 call pixel_satd_8x8_internal2 2234 lea r0, [r6 + 8*SIZEOF_PIXEL] 2235 lea r2, [r7 + 8*SIZEOF_PIXEL] 2236 call pixel_satd_8x8_internal2 2237 call pixel_satd_8x8_internal2 2238 call pixel_satd_8x8_internal2 2239 call pixel_satd_8x8_internal2 2240 lea r0, [r6 + 16*SIZEOF_PIXEL] 2241 lea r2, [r7 + 16*SIZEOF_PIXEL] 2242 call pixel_satd_8x8_internal2 2243 call pixel_satd_8x8_internal2 2244 call pixel_satd_8x8_internal2 2245 call pixel_satd_8x8_internal2 2246 lea r0, [r6 + 24*SIZEOF_PIXEL] 2247 lea r2, [r7 + 24*SIZEOF_PIXEL] 2248 call pixel_satd_8x8_internal2 2249 call pixel_satd_8x8_internal2 2250 call pixel_satd_8x8_internal2 2251 call pixel_satd_8x8_internal2 2252 lea r0, [r6 + 32*SIZEOF_PIXEL] 2253 lea r2, [r7 + 32*SIZEOF_PIXEL] 2254 call pixel_satd_8x8_internal2 2255 call pixel_satd_8x8_internal2 2256 call pixel_satd_8x8_internal2 2257 call pixel_satd_8x8_internal2 2258 lea r0, [r6 + 40*SIZEOF_PIXEL] 2259 lea r2, [r7 + 40*SIZEOF_PIXEL] 2260 call pixel_satd_8x8_internal2 2261 call pixel_satd_8x8_internal2 2262 call pixel_satd_8x8_internal2 2263 call pixel_satd_8x8_internal2 2264 lea r0, [r6 + 48*SIZEOF_PIXEL] 2265 lea r2, [r7 + 48*SIZEOF_PIXEL] 2266 call pixel_satd_8x8_internal2 2267 call pixel_satd_8x8_internal2 2268 call pixel_satd_8x8_internal2 2269 call pixel_satd_8x8_internal2 2270 lea r0, [r6 + 56*SIZEOF_PIXEL] 2271 lea r2, [r7 + 56*SIZEOF_PIXEL] 2272 call pixel_satd_8x8_internal2 2273 call pixel_satd_8x8_internal2 2274 call pixel_satd_8x8_internal2 2275 call pixel_satd_8x8_internal2 2276 HADDD m6, m0 2277 movd eax, m6 2278 RET 2279%else 2280cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64 2281 SATD_START_SSE2 m6, m7 2282 mov r6, r0 2283 mov [rsp], r2 2284 call pixel_satd_8x8_internal2 2285 call pixel_satd_8x8_internal2 2286 call pixel_satd_8x8_internal2 2287 call pixel_satd_8x8_internal2 2288 lea r0, [r6 + 8*SIZEOF_PIXEL] 2289 mov r2, [rsp] 2290 add r2, 8*SIZEOF_PIXEL 2291 call pixel_satd_8x8_internal2 2292 call pixel_satd_8x8_internal2 2293 call pixel_satd_8x8_internal2 2294 call pixel_satd_8x8_internal2 2295 lea r0, [r6 + 16*SIZEOF_PIXEL] 2296 mov r2, [rsp] 2297 add r2, 16*SIZEOF_PIXEL 2298 call pixel_satd_8x8_internal2 2299 call pixel_satd_8x8_internal2 2300 call pixel_satd_8x8_internal2 2301 call pixel_satd_8x8_internal2 2302 lea r0, [r6 + 24*SIZEOF_PIXEL] 2303 mov r2, [rsp] 2304 add r2, 24*SIZEOF_PIXEL 2305 call pixel_satd_8x8_internal2 2306 call pixel_satd_8x8_internal2 2307 call pixel_satd_8x8_internal2 2308 call pixel_satd_8x8_internal2 2309 lea r0, [r6 + 32*SIZEOF_PIXEL] 2310 mov r2, [rsp] 2311 add r2, 32*SIZEOF_PIXEL 2312 call pixel_satd_8x8_internal2 2313 call pixel_satd_8x8_internal2 2314 call pixel_satd_8x8_internal2 2315 call pixel_satd_8x8_internal2 2316 lea r0, [r6 + 40*SIZEOF_PIXEL] 2317 mov r2, [rsp] 2318 add r2, 40*SIZEOF_PIXEL 2319 call pixel_satd_8x8_internal2 2320 call pixel_satd_8x8_internal2 2321 call pixel_satd_8x8_internal2 2322 call pixel_satd_8x8_internal2 2323 lea r0, [r6 + 48*SIZEOF_PIXEL] 2324 mov r2, [rsp] 2325 add r2, 48*SIZEOF_PIXEL 2326 call pixel_satd_8x8_internal2 2327 call pixel_satd_8x8_internal2 2328 call pixel_satd_8x8_internal2 2329 call pixel_satd_8x8_internal2 2330 lea r0, [r6 + 56*SIZEOF_PIXEL] 2331 mov r2, 
[rsp] 2332 add r2, 56*SIZEOF_PIXEL 2333 call pixel_satd_8x8_internal2 2334 call pixel_satd_8x8_internal2 2335 call pixel_satd_8x8_internal2 2336 call pixel_satd_8x8_internal2 2337 HADDD m6, m0 2338 movd eax, m6 2339 RET 2340%endif 2341 2342%if WIN64 2343cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx) 2344 SATD_START_SSE2 m6, m7 2345 mov r6, r0 2346 mov r7, r2 2347 call pixel_satd_8x8_internal2 2348 call pixel_satd_8x8_internal2 2349 call pixel_satd_8x8_internal2 2350 call pixel_satd_8x8_internal2 2351 call pixel_satd_8x8_internal2 2352 call pixel_satd_8x8_internal2 2353 lea r0, [r6 + 8*SIZEOF_PIXEL] 2354 lea r2, [r7 + 8*SIZEOF_PIXEL] 2355 call pixel_satd_8x8_internal2 2356 call pixel_satd_8x8_internal2 2357 call pixel_satd_8x8_internal2 2358 call pixel_satd_8x8_internal2 2359 call pixel_satd_8x8_internal2 2360 call pixel_satd_8x8_internal2 2361 lea r0, [r6 + 16*SIZEOF_PIXEL] 2362 lea r2, [r7 + 16*SIZEOF_PIXEL] 2363 call pixel_satd_8x8_internal2 2364 call pixel_satd_8x8_internal2 2365 call pixel_satd_8x8_internal2 2366 call pixel_satd_8x8_internal2 2367 call pixel_satd_8x8_internal2 2368 call pixel_satd_8x8_internal2 2369 lea r0, [r6 + 24*SIZEOF_PIXEL] 2370 lea r2, [r7 + 24*SIZEOF_PIXEL] 2371 call pixel_satd_8x8_internal2 2372 call pixel_satd_8x8_internal2 2373 call pixel_satd_8x8_internal2 2374 call pixel_satd_8x8_internal2 2375 call pixel_satd_8x8_internal2 2376 call pixel_satd_8x8_internal2 2377 lea r0, [r6 + 32*SIZEOF_PIXEL] 2378 lea r2, [r7 + 32*SIZEOF_PIXEL] 2379 call pixel_satd_8x8_internal2 2380 call pixel_satd_8x8_internal2 2381 call pixel_satd_8x8_internal2 2382 call pixel_satd_8x8_internal2 2383 call pixel_satd_8x8_internal2 2384 call pixel_satd_8x8_internal2 2385 lea r0, [r6 + 40*SIZEOF_PIXEL] 2386 lea r2, [r7 + 40*SIZEOF_PIXEL] 2387 call pixel_satd_8x8_internal2 2388 call pixel_satd_8x8_internal2 2389 call pixel_satd_8x8_internal2 2390 call pixel_satd_8x8_internal2 2391 call pixel_satd_8x8_internal2 2392 call pixel_satd_8x8_internal2 2393 lea r0, [r6 + 48*SIZEOF_PIXEL] 2394 lea r2, [r7 + 48*SIZEOF_PIXEL] 2395 call pixel_satd_8x8_internal2 2396 call pixel_satd_8x8_internal2 2397 call pixel_satd_8x8_internal2 2398 call pixel_satd_8x8_internal2 2399 call pixel_satd_8x8_internal2 2400 call pixel_satd_8x8_internal2 2401 lea r0, [r6 + 56*SIZEOF_PIXEL] 2402 lea r2, [r7 + 56*SIZEOF_PIXEL] 2403 call pixel_satd_8x8_internal2 2404 call pixel_satd_8x8_internal2 2405 call pixel_satd_8x8_internal2 2406 call pixel_satd_8x8_internal2 2407 call pixel_satd_8x8_internal2 2408 call pixel_satd_8x8_internal2 2409 HADDD m6, m0 2410 movd eax, m6 2411 RET 2412%else 2413cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64 2414 SATD_START_SSE2 m6, m7 2415 mov r6, r0 2416 mov [rsp], r2 2417 call pixel_satd_8x8_internal2 2418 call pixel_satd_8x8_internal2 2419 call pixel_satd_8x8_internal2 2420 call pixel_satd_8x8_internal2 2421 call pixel_satd_8x8_internal2 2422 call pixel_satd_8x8_internal2 2423 lea r0, [r6 + 8*SIZEOF_PIXEL] 2424 mov r2, [rsp] 2425 add r2, 8*SIZEOF_PIXEL 2426 call pixel_satd_8x8_internal2 2427 call pixel_satd_8x8_internal2 2428 call pixel_satd_8x8_internal2 2429 call pixel_satd_8x8_internal2 2430 call pixel_satd_8x8_internal2 2431 call pixel_satd_8x8_internal2 2432 lea r0, [r6 + 16*SIZEOF_PIXEL] 2433 mov r2, [rsp] 2434 add r2, 16*SIZEOF_PIXEL 2435 call pixel_satd_8x8_internal2 2436 call pixel_satd_8x8_internal2 2437 call pixel_satd_8x8_internal2 2438 call pixel_satd_8x8_internal2 2439 call pixel_satd_8x8_internal2 2440 call pixel_satd_8x8_internal2 2441 lea r0, [r6 + 
24*SIZEOF_PIXEL] 2442 mov r2, [rsp] 2443 add r2, 24*SIZEOF_PIXEL 2444 call pixel_satd_8x8_internal2 2445 call pixel_satd_8x8_internal2 2446 call pixel_satd_8x8_internal2 2447 call pixel_satd_8x8_internal2 2448 call pixel_satd_8x8_internal2 2449 call pixel_satd_8x8_internal2 2450 lea r0, [r6 + 32*SIZEOF_PIXEL] 2451 mov r2, [rsp] 2452 add r2, 32*SIZEOF_PIXEL 2453 call pixel_satd_8x8_internal2 2454 call pixel_satd_8x8_internal2 2455 call pixel_satd_8x8_internal2 2456 call pixel_satd_8x8_internal2 2457 call pixel_satd_8x8_internal2 2458 call pixel_satd_8x8_internal2 2459 lea r0, [r6 + 40*SIZEOF_PIXEL] 2460 mov r2, [rsp] 2461 add r2, 40*SIZEOF_PIXEL 2462 call pixel_satd_8x8_internal2 2463 call pixel_satd_8x8_internal2 2464 call pixel_satd_8x8_internal2 2465 call pixel_satd_8x8_internal2 2466 call pixel_satd_8x8_internal2 2467 call pixel_satd_8x8_internal2 2468 lea r0, [r6 + 48*SIZEOF_PIXEL] 2469 mov r2, [rsp] 2470 add r2, 48*SIZEOF_PIXEL 2471 call pixel_satd_8x8_internal2 2472 call pixel_satd_8x8_internal2 2473 call pixel_satd_8x8_internal2 2474 call pixel_satd_8x8_internal2 2475 call pixel_satd_8x8_internal2 2476 call pixel_satd_8x8_internal2 2477 lea r0, [r6 + 56*SIZEOF_PIXEL] 2478 mov r2, [rsp] 2479 add r2, 56*SIZEOF_PIXEL 2480 call pixel_satd_8x8_internal2 2481 call pixel_satd_8x8_internal2 2482 call pixel_satd_8x8_internal2 2483 call pixel_satd_8x8_internal2 2484 call pixel_satd_8x8_internal2 2485 call pixel_satd_8x8_internal2 2486 HADDD m6, m0 2487 movd eax, m6 2488 RET 2489%endif 2490 2491%if WIN64 2492cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx) 2493 SATD_START_SSE2 m6, m7 2494 mov r6, r0 2495 mov r7, r2 2496 call pixel_satd_8x8_internal2 2497 call pixel_satd_8x8_internal2 2498 call pixel_satd_8x8_internal2 2499 call pixel_satd_8x8_internal2 2500 call pixel_satd_8x8_internal2 2501 call pixel_satd_8x8_internal2 2502 call pixel_satd_8x8_internal2 2503 call pixel_satd_8x8_internal2 2504 lea r0, [r6 + 8*SIZEOF_PIXEL] 2505 lea r2, [r7 + 8*SIZEOF_PIXEL] 2506 call pixel_satd_8x8_internal2 2507 call pixel_satd_8x8_internal2 2508 call pixel_satd_8x8_internal2 2509 call pixel_satd_8x8_internal2 2510 call pixel_satd_8x8_internal2 2511 call pixel_satd_8x8_internal2 2512 call pixel_satd_8x8_internal2 2513 call pixel_satd_8x8_internal2 2514 lea r0, [r6 + 16*SIZEOF_PIXEL] 2515 lea r2, [r7 + 16*SIZEOF_PIXEL] 2516 call pixel_satd_8x8_internal2 2517 call pixel_satd_8x8_internal2 2518 call pixel_satd_8x8_internal2 2519 call pixel_satd_8x8_internal2 2520 call pixel_satd_8x8_internal2 2521 call pixel_satd_8x8_internal2 2522 call pixel_satd_8x8_internal2 2523 call pixel_satd_8x8_internal2 2524 lea r0, [r6 + 24*SIZEOF_PIXEL] 2525 lea r2, [r7 + 24*SIZEOF_PIXEL] 2526 call pixel_satd_8x8_internal2 2527 call pixel_satd_8x8_internal2 2528 call pixel_satd_8x8_internal2 2529 call pixel_satd_8x8_internal2 2530 call pixel_satd_8x8_internal2 2531 call pixel_satd_8x8_internal2 2532 call pixel_satd_8x8_internal2 2533 call pixel_satd_8x8_internal2 2534 lea r0, [r6 + 32*SIZEOF_PIXEL] 2535 lea r2, [r7 + 32*SIZEOF_PIXEL] 2536 call pixel_satd_8x8_internal2 2537 call pixel_satd_8x8_internal2 2538 call pixel_satd_8x8_internal2 2539 call pixel_satd_8x8_internal2 2540 call pixel_satd_8x8_internal2 2541 call pixel_satd_8x8_internal2 2542 call pixel_satd_8x8_internal2 2543 call pixel_satd_8x8_internal2 2544 lea r0, [r6 + 40*SIZEOF_PIXEL] 2545 lea r2, [r7 + 40*SIZEOF_PIXEL] 2546 call pixel_satd_8x8_internal2 2547 call pixel_satd_8x8_internal2 2548 call pixel_satd_8x8_internal2 2549 call pixel_satd_8x8_internal2 2550 
call pixel_satd_8x8_internal2 2551 call pixel_satd_8x8_internal2 2552 call pixel_satd_8x8_internal2 2553 call pixel_satd_8x8_internal2 2554 lea r0, [r6 + 48*SIZEOF_PIXEL] 2555 lea r2, [r7 + 48*SIZEOF_PIXEL] 2556 call pixel_satd_8x8_internal2 2557 call pixel_satd_8x8_internal2 2558 call pixel_satd_8x8_internal2 2559 call pixel_satd_8x8_internal2 2560 call pixel_satd_8x8_internal2 2561 call pixel_satd_8x8_internal2 2562 call pixel_satd_8x8_internal2 2563 call pixel_satd_8x8_internal2 2564 lea r0, [r6 + 56*SIZEOF_PIXEL] 2565 lea r2, [r7 + 56*SIZEOF_PIXEL] 2566 call pixel_satd_8x8_internal2 2567 call pixel_satd_8x8_internal2 2568 call pixel_satd_8x8_internal2 2569 call pixel_satd_8x8_internal2 2570 call pixel_satd_8x8_internal2 2571 call pixel_satd_8x8_internal2 2572 call pixel_satd_8x8_internal2 2573 call pixel_satd_8x8_internal2 2574 HADDD m6, m0 2575 movd eax, m6 2576 RET 2577%else 2578cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64 2579 SATD_START_SSE2 m6, m7 2580 mov r6, r0 2581 mov [rsp], r2 2582 call pixel_satd_8x8_internal2 2583 call pixel_satd_8x8_internal2 2584 call pixel_satd_8x8_internal2 2585 call pixel_satd_8x8_internal2 2586 call pixel_satd_8x8_internal2 2587 call pixel_satd_8x8_internal2 2588 call pixel_satd_8x8_internal2 2589 call pixel_satd_8x8_internal2 2590 lea r0, [r6 + 8*SIZEOF_PIXEL] 2591 mov r2, [rsp] 2592 add r2, 8*SIZEOF_PIXEL 2593 call pixel_satd_8x8_internal2 2594 call pixel_satd_8x8_internal2 2595 call pixel_satd_8x8_internal2 2596 call pixel_satd_8x8_internal2 2597 call pixel_satd_8x8_internal2 2598 call pixel_satd_8x8_internal2 2599 call pixel_satd_8x8_internal2 2600 call pixel_satd_8x8_internal2 2601 lea r0, [r6 + 16*SIZEOF_PIXEL] 2602 mov r2, [rsp] 2603 add r2, 16*SIZEOF_PIXEL 2604 call pixel_satd_8x8_internal2 2605 call pixel_satd_8x8_internal2 2606 call pixel_satd_8x8_internal2 2607 call pixel_satd_8x8_internal2 2608 call pixel_satd_8x8_internal2 2609 call pixel_satd_8x8_internal2 2610 call pixel_satd_8x8_internal2 2611 call pixel_satd_8x8_internal2 2612 lea r0, [r6 + 24*SIZEOF_PIXEL] 2613 mov r2, [rsp] 2614 add r2, 24*SIZEOF_PIXEL 2615 call pixel_satd_8x8_internal2 2616 call pixel_satd_8x8_internal2 2617 call pixel_satd_8x8_internal2 2618 call pixel_satd_8x8_internal2 2619 call pixel_satd_8x8_internal2 2620 call pixel_satd_8x8_internal2 2621 call pixel_satd_8x8_internal2 2622 call pixel_satd_8x8_internal2 2623 lea r0, [r6 + 32*SIZEOF_PIXEL] 2624 mov r2, [rsp] 2625 add r2, 32*SIZEOF_PIXEL 2626 call pixel_satd_8x8_internal2 2627 call pixel_satd_8x8_internal2 2628 call pixel_satd_8x8_internal2 2629 call pixel_satd_8x8_internal2 2630 call pixel_satd_8x8_internal2 2631 call pixel_satd_8x8_internal2 2632 call pixel_satd_8x8_internal2 2633 call pixel_satd_8x8_internal2 2634 lea r0, [r6 + 40*SIZEOF_PIXEL] 2635 mov r2, [rsp] 2636 add r2, 40*SIZEOF_PIXEL 2637 call pixel_satd_8x8_internal2 2638 call pixel_satd_8x8_internal2 2639 call pixel_satd_8x8_internal2 2640 call pixel_satd_8x8_internal2 2641 call pixel_satd_8x8_internal2 2642 call pixel_satd_8x8_internal2 2643 call pixel_satd_8x8_internal2 2644 call pixel_satd_8x8_internal2 2645 lea r0, [r6 + 48*SIZEOF_PIXEL] 2646 mov r2, [rsp] 2647 add r2, 48*SIZEOF_PIXEL 2648 call pixel_satd_8x8_internal2 2649 call pixel_satd_8x8_internal2 2650 call pixel_satd_8x8_internal2 2651 call pixel_satd_8x8_internal2 2652 call pixel_satd_8x8_internal2 2653 call pixel_satd_8x8_internal2 2654 call pixel_satd_8x8_internal2 2655 call pixel_satd_8x8_internal2 2656 lea r0, [r6 + 56*SIZEOF_PIXEL] 2657 mov r2, [rsp] 2658 add r2, 
56*SIZEOF_PIXEL 2659 call pixel_satd_8x8_internal2 2660 call pixel_satd_8x8_internal2 2661 call pixel_satd_8x8_internal2 2662 call pixel_satd_8x8_internal2 2663 call pixel_satd_8x8_internal2 2664 call pixel_satd_8x8_internal2 2665 call pixel_satd_8x8_internal2 2666 call pixel_satd_8x8_internal2 2667 HADDD m6, m0 2668 movd eax, m6 2669 RET 2670%endif 2671 2672%if WIN64 2673cglobal pixel_satd_16x4, 4,6,14 2674%else 2675cglobal pixel_satd_16x4, 4,6,8 2676%endif 2677 SATD_START_SSE2 m6, m7 2678 BACKUP_POINTERS 2679 call %%pixel_satd_8x4_internal2 2680 RESTORE_AND_INC_POINTERS 2681 call %%pixel_satd_8x4_internal2 2682 HADDD m6, m0 2683 movd eax, m6 2684 RET 2685 2686%if WIN64 2687cglobal pixel_satd_16x8, 4,6,14 2688%else 2689cglobal pixel_satd_16x8, 4,6,8 2690%endif 2691 SATD_START_SSE2 m6, m7 2692 BACKUP_POINTERS 2693 call pixel_satd_8x8_internal2 2694 RESTORE_AND_INC_POINTERS 2695 call pixel_satd_8x8_internal2 2696 HADDD m6, m0 2697 movd eax, m6 2698 RET 2699 2700%if WIN64 2701cglobal pixel_satd_16x12, 4,6,14 2702%else 2703cglobal pixel_satd_16x12, 4,6,8 2704%endif 2705 SATD_START_SSE2 m6, m7, 1 2706 BACKUP_POINTERS 2707 call pixel_satd_8x8_internal2 2708 call %%pixel_satd_8x4_internal2 2709 RESTORE_AND_INC_POINTERS 2710 call pixel_satd_8x8_internal2 2711 call %%pixel_satd_8x4_internal2 2712 HADDD m6, m0 2713 movd eax, m6 2714 RET 2715 2716%if WIN64 2717cglobal pixel_satd_16x16, 4,6,14 2718%else 2719cglobal pixel_satd_16x16, 4,6,8 2720%endif 2721 SATD_START_SSE2 m6, m7, 1 2722 BACKUP_POINTERS 2723 call pixel_satd_8x8_internal2 2724 call pixel_satd_8x8_internal2 2725 RESTORE_AND_INC_POINTERS 2726 call pixel_satd_8x8_internal2 2727 call pixel_satd_8x8_internal2 2728 HADDD m6, m0 2729 movd eax, m6 2730 RET 2731 2732%if WIN64 2733cglobal pixel_satd_16x32, 4,6,14 2734%else 2735cglobal pixel_satd_16x32, 4,6,8 2736%endif 2737 SATD_START_SSE2 m6, m7, 1 2738 BACKUP_POINTERS 2739 call pixel_satd_8x8_internal2 2740 call pixel_satd_8x8_internal2 2741 call pixel_satd_8x8_internal2 2742 call pixel_satd_8x8_internal2 2743 RESTORE_AND_INC_POINTERS 2744 call pixel_satd_8x8_internal2 2745 call pixel_satd_8x8_internal2 2746 call pixel_satd_8x8_internal2 2747 call pixel_satd_8x8_internal2 2748 HADDD m6, m0 2749 movd eax, m6 2750 RET 2751 2752%if WIN64 2753cglobal pixel_satd_16x64, 4,6,14 2754%else 2755cglobal pixel_satd_16x64, 4,6,8 2756%endif 2757 SATD_START_SSE2 m6, m7, 1 2758 BACKUP_POINTERS 2759 call pixel_satd_8x8_internal2 2760 call pixel_satd_8x8_internal2 2761 call pixel_satd_8x8_internal2 2762 call pixel_satd_8x8_internal2 2763 call pixel_satd_8x8_internal2 2764 call pixel_satd_8x8_internal2 2765 call pixel_satd_8x8_internal2 2766 call pixel_satd_8x8_internal2 2767 RESTORE_AND_INC_POINTERS 2768 call pixel_satd_8x8_internal2 2769 call pixel_satd_8x8_internal2 2770 call pixel_satd_8x8_internal2 2771 call pixel_satd_8x8_internal2 2772 call pixel_satd_8x8_internal2 2773 call pixel_satd_8x8_internal2 2774 call pixel_satd_8x8_internal2 2775 call pixel_satd_8x8_internal2 2776 HADDD m6, m0 2777 movd eax, m6 2778 RET 2779%endif 2780 2781%if HIGH_BIT_DEPTH 2782%if WIN64 2783cglobal pixel_satd_12x16, 4,8,8 2784 SATD_START_MMX 2785 mov r6, r0 2786 mov r7, r2 2787 pxor m7, m7 2788 SATD_4x8_SSE vertical, 0, 4, 5 2789 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2790 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2791 SATD_4x8_SSE vertical, 1, 4, 5 2792 lea r0, [r6 + 4*SIZEOF_PIXEL] 2793 lea r2, [r7 + 4*SIZEOF_PIXEL] 2794 SATD_4x8_SSE vertical, 1, 4, 5 2795 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2796 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2797 
SATD_4x8_SSE vertical, 1, 4, 5 2798 lea r0, [r6 + 8*SIZEOF_PIXEL] 2799 lea r2, [r7 + 8*SIZEOF_PIXEL] 2800 SATD_4x8_SSE vertical, 1, 4, 5 2801 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2802 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2803 SATD_4x8_SSE vertical, 1, 4, 5 2804 HADDD m7, m0 2805 movd eax, m7 2806 RET 2807%else 2808cglobal pixel_satd_12x16, 4,7,8,0-gprsize 2809 SATD_START_MMX 2810 mov r6, r0 2811 mov [rsp], r2 2812 pxor m7, m7 2813 SATD_4x8_SSE vertical, 0, 4, 5 2814 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2815 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2816 SATD_4x8_SSE vertical, 1, 4, 5 2817 lea r0, [r6 + 4*SIZEOF_PIXEL] 2818 mov r2, [rsp] 2819 add r2, 4*SIZEOF_PIXEL 2820 SATD_4x8_SSE vertical, 1, 4, 5 2821 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2822 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2823 SATD_4x8_SSE vertical, 1, 4, 5 2824 lea r0, [r6 + 8*SIZEOF_PIXEL] 2825 mov r2, [rsp] 2826 add r2, 8*SIZEOF_PIXEL 2827 SATD_4x8_SSE vertical, 1, 4, 5 2828 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2829 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2830 SATD_4x8_SSE vertical, 1, 4, 5 2831 HADDD m7, m0 2832 movd eax, m7 2833 RET 2834%endif 2835%else ;HIGH_BIT_DEPTH 2836%if WIN64 2837cglobal pixel_satd_12x16, 4,8,8 2838 SATD_START_MMX 2839 mov r6, r0 2840 mov r7, r2 2841%if vertical==0 2842 mova m7, [hmul_4p] 2843%endif 2844 SATD_4x8_SSE vertical, 0, swap 2845 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2846 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2847 SATD_4x8_SSE vertical, 1, add 2848 lea r0, [r6 + 4*SIZEOF_PIXEL] 2849 lea r2, [r7 + 4*SIZEOF_PIXEL] 2850 SATD_4x8_SSE vertical, 1, add 2851 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2852 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2853 SATD_4x8_SSE vertical, 1, add 2854 lea r0, [r6 + 8*SIZEOF_PIXEL] 2855 lea r2, [r7 + 8*SIZEOF_PIXEL] 2856 SATD_4x8_SSE vertical, 1, add 2857 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2858 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2859 SATD_4x8_SSE vertical, 1, add 2860 HADDW m7, m1 2861 movd eax, m7 2862 RET 2863%else 2864cglobal pixel_satd_12x16, 4,7,8,0-gprsize 2865 SATD_START_MMX 2866 mov r6, r0 2867 mov [rsp], r2 2868%if vertical==0 2869 mova m7, [hmul_4p] 2870%endif 2871 SATD_4x8_SSE vertical, 0, swap 2872 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2873 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2874 SATD_4x8_SSE vertical, 1, add 2875 lea r0, [r6 + 4*SIZEOF_PIXEL] 2876 mov r2, [rsp] 2877 add r2, 4*SIZEOF_PIXEL 2878 SATD_4x8_SSE vertical, 1, add 2879 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2880 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2881 SATD_4x8_SSE vertical, 1, add 2882 lea r0, [r6 + 8*SIZEOF_PIXEL] 2883 mov r2, [rsp] 2884 add r2, 8*SIZEOF_PIXEL 2885 SATD_4x8_SSE vertical, 1, add 2886 lea r0, [r0 + r1*2*SIZEOF_PIXEL] 2887 lea r2, [r2 + r3*2*SIZEOF_PIXEL] 2888 SATD_4x8_SSE vertical, 1, add 2889 HADDW m7, m1 2890 movd eax, m7 2891 RET 2892%endif 2893%endif 2894 2895%if WIN64 2896cglobal pixel_satd_24x32, 4,8,14 2897 SATD_START_SSE2 m6, m7 2898 mov r6, r0 2899 mov r7, r2 2900 call pixel_satd_8x8_internal2 2901 call pixel_satd_8x8_internal2 2902 call pixel_satd_8x8_internal2 2903 call pixel_satd_8x8_internal2 2904 lea r0, [r6 + 8*SIZEOF_PIXEL] 2905 lea r2, [r7 + 8*SIZEOF_PIXEL] 2906 call pixel_satd_8x8_internal2 2907 call pixel_satd_8x8_internal2 2908 call pixel_satd_8x8_internal2 2909 call pixel_satd_8x8_internal2 2910 lea r0, [r6 + 16*SIZEOF_PIXEL] 2911 lea r2, [r7 + 16*SIZEOF_PIXEL] 2912 call pixel_satd_8x8_internal2 2913 call pixel_satd_8x8_internal2 2914 call pixel_satd_8x8_internal2 2915 call pixel_satd_8x8_internal2 2916 HADDD m6, m0 2917 movd eax, m6 2918 RET 2919%else 2920cglobal pixel_satd_24x32, 4,7,8,0-gprsize 2921 SATD_START_SSE2 m6, m7 2922 mov r6, r0 
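; Without WIN64's extra saved GPR (r7), the base of the second plane is kept
; at [rsp] and reloaded with a fresh column offset before each 8-wide strip.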
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif ;WIN64

%if WIN64
cglobal pixel_satd_8x32, 4,6,14
%else
cglobal pixel_satd_8x32, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_8x16, 4,6,14
%else
cglobal pixel_satd_8x16, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6

%if WIN64
cglobal pixel_satd_8x4, 4,6,14
%else
cglobal pixel_satd_8x4, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    call %%pixel_satd_8x4_internal2
    SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2


;=============================================================================
; SA8D
;=============================================================================

%macro SA8D_INTER 0
%if ARCH_X86_64
    %define lh m10
    %define rh m0
%else
    %define lh m0
    %define rh [esp+48]
%endif
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
    paddd lh, rh
%else
    paddusw lh, rh
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro SA8D_8x8 0
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    paddd m0, [pd_1]
    psrld m0, 1
    paddd m12, m0
%endmacro

%macro SA8D_16x16 0
    call pixel_sa8d_8x8_internal ; pix[0]
    add r2, 8*SIZEOF_PIXEL
    add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea r2, [r2+8*r3]
    lea r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub r2, 8*SIZEOF_PIXEL
    sub r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    paddd m0, [pd_1]
    psrld m0, 1
    paddd m12, m0
%endmacro

%macro AVG_16x16 0
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%endmacro

%macro SA8D 0
; sse2 doesn't seem to like the horizontal way of doing things
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
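; One 8x8 SA8D block: the source/reference differences are loaded as sum/sub
; pairs, run through an 8x8 2D Hadamard transform, and the absolute values are
; accumulated into m0; callers do the horizontal add and the final (x+1)>>1.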
cglobal pixel_sa8d_8x8_internal
    lea r6, [r0+4*r1]
    lea r7, [r2+4*r3]
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
%endif
    paddw m0, m1
    paddw m0, m2
    paddw m0, m8
    SAVE_MM_PERMUTATION
    ret

cglobal pixel_sa8d_8x8, 4,8,12
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal pixel_sa8d_16x16, 4,8,12
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal ; pix[0]
    add r2, 8*SIZEOF_PIXEL
    add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea r2, [r2+8*r3]
    lea r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub r2, 8*SIZEOF_PIXEL
    sub r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal pixel_sa8d_8x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    lea r0, [r0 + 8*r1]
    lea r2, [r2 + 8*r3]
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_8x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_16x8, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_16x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_16x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_24x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
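; 24x32 is accumulated as a 3x4 grid of 8x8 SA8D calls; the pointers snake
; left-to-right and back again, so only a single 8-pixel (or 8-row)
; adjustment of r0/r2 is needed between calls.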
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_32x8, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_32x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_32x24, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_32x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_32x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
3405 lea r4, [3*r1] 3406 lea r5, [3*r3] 3407 SA8D_16x16 3408 lea r0, [r0+8*r1] 3409 lea r2, [r2+8*r3] 3410 SA8D_16x16 3411 lea r4, [8*r1] 3412 lea r5, [8*r3] 3413 sub r0, r4 3414 sub r2, r5 3415 sub r2, 16*SIZEOF_PIXEL 3416 sub r0, 16*SIZEOF_PIXEL 3417 lea r4, [3*r1] 3418 lea r5, [3*r3] 3419 SA8D_16x16 3420 lea r0, [r0+8*r1] 3421 lea r2, [r2+8*r3] 3422 SA8D_16x16 3423 lea r4, [8*r1] 3424 lea r5, [8*r3] 3425 sub r0, r4 3426 sub r2, r5 3427 add r2, 16*SIZEOF_PIXEL 3428 add r0, 16*SIZEOF_PIXEL 3429 lea r4, [3*r1] 3430 lea r5, [3*r3] 3431 SA8D_16x16 3432 lea r0, [r0+8*r1] 3433 lea r2, [r2+8*r3] 3434 SA8D_16x16 3435 lea r4, [8*r1] 3436 lea r5, [8*r3] 3437 sub r0, r4 3438 sub r2, r5 3439 sub r2, 16*SIZEOF_PIXEL 3440 sub r0, 16*SIZEOF_PIXEL 3441 lea r4, [3*r1] 3442 lea r5, [3*r3] 3443 SA8D_16x16 3444 movd eax, m12 3445 RET 3446 3447cglobal pixel_sa8d_48x64, 4,8,13 3448 FIX_STRIDES r1, r3 3449 lea r4, [3*r1] 3450 lea r5, [3*r3] 3451 pxor m12, m12 3452%if vertical == 0 3453 mova m7, [hmul_8p] 3454%endif 3455 SA8D_16x16 3456 lea r4, [8*r1] 3457 lea r5, [8*r3] 3458 sub r0, r4 3459 sub r2, r5 3460 add r2, 16*SIZEOF_PIXEL 3461 add r0, 16*SIZEOF_PIXEL 3462 lea r4, [3*r1] 3463 lea r5, [3*r3] 3464 SA8D_16x16 3465 lea r4, [8*r1] 3466 lea r5, [8*r3] 3467 sub r0, r4 3468 sub r2, r5 3469 add r2, 16*SIZEOF_PIXEL 3470 add r0, 16*SIZEOF_PIXEL 3471 lea r4, [3*r1] 3472 lea r5, [3*r3] 3473 SA8D_16x16 3474 lea r0, [r0+8*r1] 3475 lea r2, [r2+8*r3] 3476 SA8D_16x16 3477 lea r4, [8*r1] 3478 lea r5, [8*r3] 3479 sub r0, r4 3480 sub r2, r5 3481 sub r2, 16*SIZEOF_PIXEL 3482 sub r0, 16*SIZEOF_PIXEL 3483 lea r4, [3*r1] 3484 lea r5, [3*r3] 3485 SA8D_16x16 3486 lea r4, [8*r1] 3487 lea r5, [8*r3] 3488 sub r0, r4 3489 sub r2, r5 3490 sub r2, 16*SIZEOF_PIXEL 3491 sub r0, 16*SIZEOF_PIXEL 3492 lea r4, [3*r1] 3493 lea r5, [3*r3] 3494 SA8D_16x16 3495 lea r0, [r0+8*r1] 3496 lea r2, [r2+8*r3] 3497 SA8D_16x16 3498 lea r4, [8*r1] 3499 lea r5, [8*r3] 3500 sub r0, r4 3501 sub r2, r5 3502 add r2, 16*SIZEOF_PIXEL 3503 add r0, 16*SIZEOF_PIXEL 3504 lea r4, [3*r1] 3505 lea r5, [3*r3] 3506 SA8D_16x16 3507 lea r4, [8*r1] 3508 lea r5, [8*r3] 3509 sub r0, r4 3510 sub r2, r5 3511 add r2, 16*SIZEOF_PIXEL 3512 add r0, 16*SIZEOF_PIXEL 3513 lea r4, [3*r1] 3514 lea r5, [3*r3] 3515 SA8D_16x16 3516 lea r0, [r0+8*r1] 3517 lea r2, [r2+8*r3] 3518 SA8D_16x16 3519 lea r4, [8*r1] 3520 lea r5, [8*r3] 3521 sub r0, r4 3522 sub r2, r5 3523 sub r2, 16*SIZEOF_PIXEL 3524 sub r0, 16*SIZEOF_PIXEL 3525 lea r4, [3*r1] 3526 lea r5, [3*r3] 3527 SA8D_16x16 3528 lea r4, [8*r1] 3529 lea r5, [8*r3] 3530 sub r0, r4 3531 sub r2, r5 3532 sub r2, 16*SIZEOF_PIXEL 3533 sub r0, 16*SIZEOF_PIXEL 3534 lea r4, [3*r1] 3535 lea r5, [3*r3] 3536 SA8D_16x16 3537 movd eax, m12 3538 RET 3539 3540cglobal pixel_sa8d_64x16, 4,8,13 3541 FIX_STRIDES r1, r3 3542 lea r4, [3*r1] 3543 lea r5, [3*r3] 3544 pxor m12, m12 3545%if vertical == 0 3546 mova m7, [hmul_8p] 3547%endif 3548 SA8D_16x16 3549 lea r4, [8*r1] 3550 lea r5, [8*r3] 3551 sub r0, r4 3552 sub r2, r5 3553 add r2, 16*SIZEOF_PIXEL 3554 add r0, 16*SIZEOF_PIXEL 3555 lea r4, [3*r1] 3556 lea r5, [3*r3] 3557 SA8D_16x16 3558 lea r4, [8*r1] 3559 lea r5, [8*r3] 3560 sub r0, r4 3561 sub r2, r5 3562 add r2, 16*SIZEOF_PIXEL 3563 add r0, 16*SIZEOF_PIXEL 3564 lea r4, [3*r1] 3565 lea r5, [3*r3] 3566 SA8D_16x16 3567 lea r4, [8*r1] 3568 lea r5, [8*r3] 3569 sub r0, r4 3570 sub r2, r5 3571 add r2, 16*SIZEOF_PIXEL 3572 add r0, 16*SIZEOF_PIXEL 3573 lea r4, [3*r1] 3574 lea r5, [3*r3] 3575 SA8D_16x16 3576 movd eax, m12 3577 RET 3578 3579cglobal pixel_sa8d_64x32, 4,8,13 
3580 FIX_STRIDES r1, r3 3581 lea r4, [3*r1] 3582 lea r5, [3*r3] 3583 pxor m12, m12 3584%if vertical == 0 3585 mova m7, [hmul_8p] 3586%endif 3587 SA8D_16x16 3588 lea r4, [8*r1] 3589 lea r5, [8*r3] 3590 sub r0, r4 3591 sub r2, r5 3592 add r2, 16*SIZEOF_PIXEL 3593 add r0, 16*SIZEOF_PIXEL 3594 lea r4, [3*r1] 3595 lea r5, [3*r3] 3596 SA8D_16x16 3597 lea r4, [8*r1] 3598 lea r5, [8*r3] 3599 sub r0, r4 3600 sub r2, r5 3601 add r2, 16*SIZEOF_PIXEL 3602 add r0, 16*SIZEOF_PIXEL 3603 lea r4, [3*r1] 3604 lea r5, [3*r3] 3605 SA8D_16x16 3606 lea r4, [8*r1] 3607 lea r5, [8*r3] 3608 sub r0, r4 3609 sub r2, r5 3610 add r2, 16*SIZEOF_PIXEL 3611 add r0, 16*SIZEOF_PIXEL 3612 lea r4, [3*r1] 3613 lea r5, [3*r3] 3614 SA8D_16x16 3615 lea r0, [r0+8*r1] 3616 lea r2, [r2+8*r3] 3617 SA8D_16x16 3618 lea r4, [8*r1] 3619 lea r5, [8*r3] 3620 sub r0, r4 3621 sub r2, r5 3622 sub r2, 16*SIZEOF_PIXEL 3623 sub r0, 16*SIZEOF_PIXEL 3624 lea r4, [3*r1] 3625 lea r5, [3*r3] 3626 SA8D_16x16 3627 lea r4, [8*r1] 3628 lea r5, [8*r3] 3629 sub r0, r4 3630 sub r2, r5 3631 sub r2, 16*SIZEOF_PIXEL 3632 sub r0, 16*SIZEOF_PIXEL 3633 lea r4, [3*r1] 3634 lea r5, [3*r3] 3635 SA8D_16x16 3636 lea r4, [8*r1] 3637 lea r5, [8*r3] 3638 sub r0, r4 3639 sub r2, r5 3640 sub r2, 16*SIZEOF_PIXEL 3641 sub r0, 16*SIZEOF_PIXEL 3642 lea r4, [3*r1] 3643 lea r5, [3*r3] 3644 SA8D_16x16 3645 movd eax, m12 3646 RET 3647 3648cglobal pixel_sa8d_64x48, 4,8,13 3649 FIX_STRIDES r1, r3 3650 lea r4, [3*r1] 3651 lea r5, [3*r3] 3652 pxor m12, m12 3653%if vertical == 0 3654 mova m7, [hmul_8p] 3655%endif 3656 SA8D_16x16 3657 lea r4, [8*r1] 3658 lea r5, [8*r3] 3659 sub r0, r4 3660 sub r2, r5 3661 add r2, 16*SIZEOF_PIXEL 3662 add r0, 16*SIZEOF_PIXEL 3663 lea r4, [3*r1] 3664 lea r5, [3*r3] 3665 SA8D_16x16 3666 lea r4, [8*r1] 3667 lea r5, [8*r3] 3668 sub r0, r4 3669 sub r2, r5 3670 add r2, 16*SIZEOF_PIXEL 3671 add r0, 16*SIZEOF_PIXEL 3672 lea r4, [3*r1] 3673 lea r5, [3*r3] 3674 SA8D_16x16 3675 lea r4, [8*r1] 3676 lea r5, [8*r3] 3677 sub r0, r4 3678 sub r2, r5 3679 add r2, 16*SIZEOF_PIXEL 3680 add r0, 16*SIZEOF_PIXEL 3681 lea r4, [3*r1] 3682 lea r5, [3*r3] 3683 SA8D_16x16 3684 lea r0, [r0+8*r1] 3685 lea r2, [r2+8*r3] 3686 SA8D_16x16 3687 lea r4, [8*r1] 3688 lea r5, [8*r3] 3689 sub r0, r4 3690 sub r2, r5 3691 sub r2, 16*SIZEOF_PIXEL 3692 sub r0, 16*SIZEOF_PIXEL 3693 lea r4, [3*r1] 3694 lea r5, [3*r3] 3695 SA8D_16x16 3696 lea r4, [8*r1] 3697 lea r5, [8*r3] 3698 sub r0, r4 3699 sub r2, r5 3700 sub r2, 16*SIZEOF_PIXEL 3701 sub r0, 16*SIZEOF_PIXEL 3702 lea r4, [3*r1] 3703 lea r5, [3*r3] 3704 SA8D_16x16 3705 lea r4, [8*r1] 3706 lea r5, [8*r3] 3707 sub r0, r4 3708 sub r2, r5 3709 sub r2, 16*SIZEOF_PIXEL 3710 sub r0, 16*SIZEOF_PIXEL 3711 lea r4, [3*r1] 3712 lea r5, [3*r3] 3713 SA8D_16x16 3714 lea r0, [r0+8*r1] 3715 lea r2, [r2+8*r3] 3716 SA8D_16x16 3717 lea r4, [8*r1] 3718 lea r5, [8*r3] 3719 sub r0, r4 3720 sub r2, r5 3721 add r2, 16*SIZEOF_PIXEL 3722 add r0, 16*SIZEOF_PIXEL 3723 lea r4, [3*r1] 3724 lea r5, [3*r3] 3725 SA8D_16x16 3726 lea r4, [8*r1] 3727 lea r5, [8*r3] 3728 sub r0, r4 3729 sub r2, r5 3730 add r2, 16*SIZEOF_PIXEL 3731 add r0, 16*SIZEOF_PIXEL 3732 lea r4, [3*r1] 3733 lea r5, [3*r3] 3734 SA8D_16x16 3735 lea r4, [8*r1] 3736 lea r5, [8*r3] 3737 sub r0, r4 3738 sub r2, r5 3739 add r2, 16*SIZEOF_PIXEL 3740 add r0, 16*SIZEOF_PIXEL 3741 lea r4, [3*r1] 3742 lea r5, [3*r3] 3743 SA8D_16x16 3744 movd eax, m12 3745 RET 3746 3747cglobal pixel_sa8d_64x64, 4,8,13 3748 FIX_STRIDES r1, r3 3749 lea r4, [3*r1] 3750 lea r5, [3*r3] 3751 pxor m12, m12 3752%if vertical == 0 3753 mova m7, 
[hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

%else ; ARCH_X86_32
%if mmsize == 16
cglobal pixel_sa8d_8x8_internal
    %define spill0 [esp+4]
    %define spill1 [esp+20]
    %define spill2 [esp+36]
%if vertical
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; mmsize == 8
    mova m7, [hmul_8p]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    mova spill0, m2
    mova spill1, m3
    mova spill2, m1
    SWAP 1, 7
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V 4, 5, 6, 7, 3
    mova m1, spill2
    mova m2, spill0
    mova m3, spill1
    mova spill0, m6
    mova spill1, m7
    HADAMARD4_V 0, 1, 2, 3, 7
    SUMSUB_BADC w, 0, 4, 1, 5, 7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    mova m6, spill0
    mova m7, spill1
    paddw m0, m1
    SUMSUB_BADC w, 2, 6, 3, 7, 4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret
%endif ; ifndef mmx2

cglobal pixel_sa8d_8x8_internal2
    %define spill0 [esp+4]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret

cglobal pixel_sa8d_8x8, 4,7
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 48
    lea r4, [3*r1]
    lea r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal pixel_sa8d_16x16, 4,7
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [3*r1]
    lea r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%else
    SA8D_INTER
%endif
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    SA8D_INTER
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [esp+64-mmsize]
%if mmsize == 16
    HADDUW m0, m1
%else
    mova m2, [esp+48]
    pxor m7, m7
    mova m1, m0
    mova m3, m2
    punpcklwd m0, m7
    punpckhwd m1, m7
    punpcklwd m2, m7
    punpckhwd m3, m7
    paddd m0, m1
    paddd m2, m3
    paddd m0, m2
    HADDD m0, m1
%endif
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal pixel_sa8d_8x16, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_8x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call
pixel_sa8d_8x8_internal2 4074 HADDUW m0, m1 4075 movd r4d, m0 4076 add r4d, 1 4077 shr r4d, 1 4078 add r4d, dword [esp+36] 4079 mov dword [esp+36], r4d 4080 4081 mov r0, [r6+20] 4082 mov r2, [r6+28] 4083 lea r0, [r0 + r1*8] 4084 lea r2, [r2 + r3*8] 4085 lea r0, [r0 + r1*8] 4086 lea r2, [r2 + r3*8] 4087 lea r4, [r1 + 2*r1] 4088 call pixel_sa8d_8x8_internal2 4089 HADDUW m0, m1 4090 movd r4d, m0 4091 add r4d, 1 4092 shr r4d, 1 4093 add r4d, dword [esp+36] 4094 mov dword [esp+36], r4d 4095 4096 mov r0, [r6+20] 4097 mov r2, [r6+28] 4098 lea r0, [r0 + r1*8] 4099 lea r2, [r2 + r3*8] 4100 lea r0, [r0 + r1*8] 4101 lea r2, [r2 + r3*8] 4102 lea r0, [r0 + r1*8] 4103 lea r2, [r2 + r3*8] 4104 lea r4, [r1 + 2*r1] 4105 call pixel_sa8d_8x8_internal2 4106 HADDUW m0, m1 4107 movd r4d, m0 4108 add r4d, 1 4109 shr r4d, 1 4110 add r4d, dword [esp+36] 4111 mov eax, r4d 4112 mov esp, r6 4113 RET 4114 4115cglobal pixel_sa8d_16x8, 4,7,8 4116 FIX_STRIDES r1, r3 4117 mov r6, esp 4118 and esp, ~15 4119 sub esp, 64 4120 4121 lea r4, [r1 + 2*r1] 4122 lea r5, [r3 + 2*r3] 4123 call pixel_sa8d_8x8_internal2 4124 HADDUW m0, m1 4125 movd r4d, m0 4126 add r4d, 1 4127 shr r4d, 1 4128 mov dword [esp+36], r4d 4129 4130 mov r0, [r6+20] 4131 mov r2, [r6+28] 4132 add r0, 8*SIZEOF_PIXEL 4133 add r2, 8*SIZEOF_PIXEL 4134 lea r4, [r1 + 2*r1] 4135 call pixel_sa8d_8x8_internal2 4136 HADDUW m0, m1 4137 movd r4d, m0 4138 add r4d, 1 4139 shr r4d, 1 4140 add r4d, dword [esp+36] 4141 mov eax, r4d 4142 mov esp, r6 4143 RET 4144 4145cglobal pixel_sa8d_16x32, 4,7,8 4146 FIX_STRIDES r1, r3 4147 mov r6, esp 4148 and esp, ~15 4149 sub esp, 64 4150 4151 lea r4, [r1 + 2*r1] 4152 lea r5, [r3 + 2*r3] 4153 call pixel_sa8d_8x8_internal2 4154%if HIGH_BIT_DEPTH 4155 HADDUW m0, m1 4156%endif 4157 mova [rsp+48], m0 4158 call pixel_sa8d_8x8_internal2 4159 SA8D_INTER 4160 mova [esp+48], m0 4161 4162 mov r0, [r6+20] 4163 mov r2, [r6+28] 4164 add r0, 8*SIZEOF_PIXEL 4165 add r2, 8*SIZEOF_PIXEL 4166 call pixel_sa8d_8x8_internal2 4167 SA8D_INTER 4168 mova [esp+48], m0 4169 call pixel_sa8d_8x8_internal2 4170 SA8D_INTER 4171%if HIGH_BIT_DEPTH == 0 4172 HADDUW m0, m1 4173%endif 4174 movd r4d, m0 4175 add r4d, 1 4176 shr r4d, 1 4177 mov dword [esp+36], r4d 4178 4179 mov r0, [r6+20] 4180 mov r2, [r6+28] 4181 lea r0, [r0 + r1*8] 4182 lea r2, [r2 + r3*8] 4183 lea r0, [r0 + r1*8] 4184 lea r2, [r2 + r3*8] 4185 lea r4, [r1 + 2*r1] 4186 call pixel_sa8d_8x8_internal2 4187%if HIGH_BIT_DEPTH 4188 HADDUW m0, m1 4189%endif 4190 mova [esp+48], m0 4191 call pixel_sa8d_8x8_internal2 4192 SA8D_INTER 4193 mova [esp+48], m0 4194 4195 mov r0, [r6+20] 4196 mov r2, [r6+28] 4197 lea r0, [r0 + r1*8] 4198 lea r2, [r2 + r3*8] 4199 lea r0, [r0 + r1*8] 4200 lea r2, [r2 + r3*8] 4201 add r0, 8*SIZEOF_PIXEL 4202 add r2, 8*SIZEOF_PIXEL 4203 call pixel_sa8d_8x8_internal2 4204 SA8D_INTER 4205 mova [esp+48], m0 4206 call pixel_sa8d_8x8_internal2 4207 SA8D_INTER 4208%if HIGH_BIT_DEPTH == 0 4209 HADDUW m0, m1 4210%endif 4211 movd r4d, m0 4212 add r4d, 1 4213 shr r4d, 1 4214 add r4d, dword [esp+36] 4215 mov eax, r4d 4216 mov esp, r6 4217 RET 4218 4219cglobal pixel_sa8d_16x64, 4,7,8 4220 FIX_STRIDES r1, r3 4221 mov r6, esp 4222 and esp, ~15 4223 sub esp, 64 4224 4225 lea r4, [r1 + 2*r1] 4226 lea r5, [r3 + 2*r3] 4227 call pixel_sa8d_8x8_internal2 4228%if HIGH_BIT_DEPTH 4229 HADDUW m0, m1 4230%endif 4231 mova [rsp+48], m0 4232 call pixel_sa8d_8x8_internal2 4233 SA8D_INTER 4234 mova [esp+48], m0 4235 4236 mov r0, [r6+20] 4237 mov r2, [r6+28] 4238 add r0, 8*SIZEOF_PIXEL 4239 add r2, 8*SIZEOF_PIXEL 4240 call 
pixel_sa8d_8x8_internal2 4241 SA8D_INTER 4242 mova [esp+48], m0 4243 call pixel_sa8d_8x8_internal2 4244 SA8D_INTER 4245%if HIGH_BIT_DEPTH == 0 4246 HADDUW m0, m1 4247%endif 4248 movd r4d, m0 4249 add r4d, 1 4250 shr r4d, 1 4251 mov dword [esp+36], r4d 4252 4253 mov r0, [r6+20] 4254 mov r2, [r6+28] 4255 lea r0, [r0 + r1*8] 4256 lea r2, [r2 + r3*8] 4257 lea r0, [r0 + r1*8] 4258 lea r2, [r2 + r3*8] 4259 mov [r6+20], r0 4260 mov [r6+28], r2 4261 4262 lea r4, [r1 + 2*r1] 4263 call pixel_sa8d_8x8_internal2 4264%if HIGH_BIT_DEPTH 4265 HADDUW m0, m1 4266%endif 4267 mova [esp+48], m0 4268 call pixel_sa8d_8x8_internal2 4269 SA8D_INTER 4270 mova [esp+48], m0 4271 4272 mov r0, [r6+20] 4273 mov r2, [r6+28] 4274 add r0, 8*SIZEOF_PIXEL 4275 add r2, 8*SIZEOF_PIXEL 4276 call pixel_sa8d_8x8_internal2 4277 SA8D_INTER 4278 mova [esp+64-mmsize], m0 4279 call pixel_sa8d_8x8_internal2 4280 AVG_16x16 4281 4282 mov r0, [r6+20] 4283 mov r2, [r6+28] 4284 lea r0, [r0 + r1*8] 4285 lea r2, [r2 + r3*8] 4286 lea r0, [r0 + r1*8] 4287 lea r2, [r2 + r3*8] 4288 mov [r6+20], r0 4289 mov [r6+28], r2 4290 4291 lea r4, [r1 + 2*r1] 4292 call pixel_sa8d_8x8_internal2 4293%if HIGH_BIT_DEPTH 4294 HADDUW m0, m1 4295%endif 4296 mova [esp+48], m0 4297 call pixel_sa8d_8x8_internal2 4298 SA8D_INTER 4299 mova [esp+48], m0 4300 4301 mov r0, [r6+20] 4302 mov r2, [r6+28] 4303 add r0, 8*SIZEOF_PIXEL 4304 add r2, 8*SIZEOF_PIXEL 4305 call pixel_sa8d_8x8_internal2 4306 SA8D_INTER 4307 mova [esp+64-mmsize], m0 4308 call pixel_sa8d_8x8_internal2 4309 AVG_16x16 4310 4311 mov r0, [r6+20] 4312 mov r2, [r6+28] 4313 lea r0, [r0 + r1*8] 4314 lea r2, [r2 + r3*8] 4315 lea r0, [r0 + r1*8] 4316 lea r2, [r2 + r3*8] 4317 mov [r6+20], r0 4318 mov [r6+28], r2 4319 4320 lea r4, [r1 + 2*r1] 4321 call pixel_sa8d_8x8_internal2 4322%if HIGH_BIT_DEPTH 4323 HADDUW m0, m1 4324%endif 4325 mova [esp+48], m0 4326 call pixel_sa8d_8x8_internal2 4327 SA8D_INTER 4328 mova [esp+48], m0 4329 4330 mov r0, [r6+20] 4331 mov r2, [r6+28] 4332 add r0, 8*SIZEOF_PIXEL 4333 add r2, 8*SIZEOF_PIXEL 4334 call pixel_sa8d_8x8_internal2 4335 SA8D_INTER 4336 mova [esp+64-mmsize], m0 4337 call pixel_sa8d_8x8_internal2 4338 SA8D_INTER 4339%if HIGH_BIT_DEPTH == 0 4340 HADDUW m0, m1 4341%endif 4342 movd r4d, m0 4343 add r4d, 1 4344 shr r4d, 1 4345 add r4d, dword [esp+36] 4346 mov eax, r4d 4347 mov esp, r6 4348 RET 4349 4350cglobal pixel_sa8d_24x32, 4,7,8 4351 FIX_STRIDES r1, r3 4352 mov r6, esp 4353 and esp, ~15 4354 sub esp, 64 4355 4356 lea r4, [r1 + 2*r1] 4357 lea r5, [r3 + 2*r3] 4358 call pixel_sa8d_8x8_internal2 4359 HADDUW m0, m1 4360 movd r4d, m0 4361 add r4d, 1 4362 shr r4d, 1 4363 mov dword [esp+36], r4d 4364 4365 mov r0, [r6+20] 4366 mov r2, [r6+28] 4367 add r0, 8*SIZEOF_PIXEL 4368 add r2, 8*SIZEOF_PIXEL 4369 lea r4, [r1 + 2*r1] 4370 call pixel_sa8d_8x8_internal2 4371 HADDUW m0, m1 4372 movd r4d, m0 4373 add r4d, 1 4374 shr r4d, 1 4375 add r4d, dword [esp+36] 4376 mov dword [esp+36], r4d 4377 4378 mov r0, [r6+20] 4379 mov r2, [r6+28] 4380 add r0, 16*SIZEOF_PIXEL 4381 add r2, 16*SIZEOF_PIXEL 4382 lea r4, [r1 + 2*r1] 4383 call pixel_sa8d_8x8_internal2 4384 HADDUW m0, m1 4385 movd r4d, m0 4386 add r4d, 1 4387 shr r4d, 1 4388 add r4d, dword [esp+36] 4389 mov dword [esp+36], r4d 4390 4391 mov r0, [r6+20] 4392 mov r2, [r6+28] 4393 lea r0, [r0 + r1*8] 4394 lea r2, [r2 + r3*8] 4395 mov [r6+20], r0 4396 mov [r6+28], r2 4397 lea r4, [r1 + 2*r1] 4398 call pixel_sa8d_8x8_internal2 4399 HADDUW m0, m1 4400 movd r4d, m0 4401 add r4d, 1 4402 shr r4d, 1 4403 add r4d, dword [esp+36] 4404 mov dword 
[esp+36], r4d 4405 4406 mov r0, [r6+20] 4407 mov r2, [r6+28] 4408 add r0, 8*SIZEOF_PIXEL 4409 add r2, 8*SIZEOF_PIXEL 4410 lea r4, [r1 + 2*r1] 4411 call pixel_sa8d_8x8_internal2 4412 HADDUW m0, m1 4413 movd r4d, m0 4414 add r4d, 1 4415 shr r4d, 1 4416 add r4d, dword [esp+36] 4417 mov dword [esp+36], r4d 4418 4419 mov r0, [r6+20] 4420 mov r2, [r6+28] 4421 add r0, 16*SIZEOF_PIXEL 4422 add r2, 16*SIZEOF_PIXEL 4423 lea r4, [r1 + 2*r1] 4424 call pixel_sa8d_8x8_internal2 4425 HADDUW m0, m1 4426 movd r4d, m0 4427 add r4d, 1 4428 shr r4d, 1 4429 add r4d, dword [esp+36] 4430 mov dword [esp+36], r4d 4431 4432 mov r0, [r6+20] 4433 mov r2, [r6+28] 4434 lea r0, [r0 + r1*8] 4435 lea r2, [r2 + r3*8] 4436 mov [r6+20], r0 4437 mov [r6+28], r2 4438 lea r4, [r1 + 2*r1] 4439 call pixel_sa8d_8x8_internal2 4440 HADDUW m0, m1 4441 movd r4d, m0 4442 add r4d, 1 4443 shr r4d, 1 4444 add r4d, dword [esp+36] 4445 mov dword [esp+36], r4d 4446 4447 mov r0, [r6+20] 4448 mov r2, [r6+28] 4449 add r0, 8*SIZEOF_PIXEL 4450 add r2, 8*SIZEOF_PIXEL 4451 lea r4, [r1 + 2*r1] 4452 call pixel_sa8d_8x8_internal2 4453 HADDUW m0, m1 4454 movd r4d, m0 4455 add r4d, 1 4456 shr r4d, 1 4457 add r4d, dword [esp+36] 4458 mov dword [esp+36], r4d 4459 4460 mov r0, [r6+20] 4461 mov r2, [r6+28] 4462 add r0, 16*SIZEOF_PIXEL 4463 add r2, 16*SIZEOF_PIXEL 4464 lea r4, [r1 + 2*r1] 4465 call pixel_sa8d_8x8_internal2 4466 HADDUW m0, m1 4467 movd r4d, m0 4468 add r4d, 1 4469 shr r4d, 1 4470 add r4d, dword [esp+36] 4471 mov dword [esp+36], r4d 4472 4473 mov r0, [r6+20] 4474 mov r2, [r6+28] 4475 lea r0, [r0 + r1*8] 4476 lea r2, [r2 + r3*8] 4477 mov [r6+20], r0 4478 mov [r6+28], r2 4479 lea r4, [r1 + 2*r1] 4480 call pixel_sa8d_8x8_internal2 4481 HADDUW m0, m1 4482 movd r4d, m0 4483 add r4d, 1 4484 shr r4d, 1 4485 add r4d, dword [esp+36] 4486 mov dword [esp+36], r4d 4487 4488 mov r0, [r6+20] 4489 mov r2, [r6+28] 4490 add r0, 8*SIZEOF_PIXEL 4491 add r2, 8*SIZEOF_PIXEL 4492 lea r4, [r1 + 2*r1] 4493 call pixel_sa8d_8x8_internal2 4494 HADDUW m0, m1 4495 movd r4d, m0 4496 add r4d, 1 4497 shr r4d, 1 4498 add r4d, dword [esp+36] 4499 mov dword [esp+36], r4d 4500 4501 mov r0, [r6+20] 4502 mov r2, [r6+28] 4503 add r0, 16*SIZEOF_PIXEL 4504 add r2, 16*SIZEOF_PIXEL 4505 lea r4, [r1 + 2*r1] 4506 call pixel_sa8d_8x8_internal2 4507 HADDUW m0, m1 4508 movd r4d, m0 4509 add r4d, 1 4510 shr r4d, 1 4511 add r4d, dword [esp+36] 4512 mov eax, r4d 4513 mov esp, r6 4514 RET 4515 4516cglobal pixel_sa8d_32x8, 4,7,8 4517 FIX_STRIDES r1, r3 4518 mov r6, esp 4519 and esp, ~15 4520 sub esp, 64 4521 4522 lea r4, [r1 + 2*r1] 4523 lea r5, [r3 + 2*r3] 4524 call pixel_sa8d_8x8_internal2 4525 HADDUW m0, m1 4526 movd r4d, m0 4527 add r4d, 1 4528 shr r4d, 1 4529 mov dword [esp+36], r4d 4530 4531 mov r0, [r6+20] 4532 mov r2, [r6+28] 4533 add r0, 8*SIZEOF_PIXEL 4534 add r2, 8*SIZEOF_PIXEL 4535 lea r4, [r1 + 2*r1] 4536 call pixel_sa8d_8x8_internal2 4537 HADDUW m0, m1 4538 movd r4d, m0 4539 add r4d, 1 4540 shr r4d, 1 4541 add r4d, dword [esp+36] 4542 mov dword [esp+36], r4d 4543 4544 mov r0, [r6+20] 4545 mov r2, [r6+28] 4546 add r0, 16*SIZEOF_PIXEL 4547 add r2, 16*SIZEOF_PIXEL 4548 lea r4, [r1 + 2*r1] 4549 call pixel_sa8d_8x8_internal2 4550 HADDUW m0, m1 4551 movd r4d, m0 4552 add r4d, 1 4553 shr r4d, 1 4554 add r4d, dword [esp+36] 4555 mov dword [esp+36], r4d 4556 4557 mov r0, [r6+20] 4558 mov r2, [r6+28] 4559 add r0, 24*SIZEOF_PIXEL 4560 add r2, 24*SIZEOF_PIXEL 4561 lea r4, [r1 + 2*r1] 4562 call pixel_sa8d_8x8_internal2 4563 HADDUW m0, m1 4564 movd r4d, m0 4565 add r4d, 1 4566 shr r4d, 1 4567 
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x16, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x24, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
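    ; running total: each 8x8 partial is rounded with (x+1)>>1 and then
    ; accumulated into the scalar kept at [esp+36]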
4732 add r4d, dword [esp+36] 4733 mov dword [esp+36], r4d 4734 4735 mov r0, [r6+20] 4736 mov r2, [r6+28] 4737 add r0, 24*SIZEOF_PIXEL 4738 add r2, 24*SIZEOF_PIXEL 4739 lea r4, [r1 + 2*r1] 4740 call pixel_sa8d_8x8_internal2 4741 HADDUW m0, m1 4742 movd r4d, m0 4743 add r4d, 1 4744 shr r4d, 1 4745 add r4d, dword [esp+36] 4746 mov dword [esp+36], r4d 4747 4748 mov r0, [r6+20] 4749 mov r2, [r6+28] 4750 lea r0, [r0 + r1*8] 4751 lea r2, [r2 + r3*8] 4752 mov [r6+20], r0 4753 mov [r6+28], r2 4754 lea r4, [r1 + 2*r1] 4755 call pixel_sa8d_8x8_internal2 4756 HADDUW m0, m1 4757 movd r4d, m0 4758 add r4d, 1 4759 shr r4d, 1 4760 add r4d, dword [esp+36] 4761 mov dword [esp+36], r4d 4762 4763 mov r0, [r6+20] 4764 mov r2, [r6+28] 4765 add r0, 8*SIZEOF_PIXEL 4766 add r2, 8*SIZEOF_PIXEL 4767 lea r4, [r1 + 2*r1] 4768 call pixel_sa8d_8x8_internal2 4769 HADDUW m0, m1 4770 movd r4d, m0 4771 add r4d, 1 4772 shr r4d, 1 4773 add r4d, dword [esp+36] 4774 mov dword [esp+36], r4d 4775 4776 mov r0, [r6+20] 4777 mov r2, [r6+28] 4778 add r0, 16*SIZEOF_PIXEL 4779 add r2, 16*SIZEOF_PIXEL 4780 lea r4, [r1 + 2*r1] 4781 call pixel_sa8d_8x8_internal2 4782 HADDUW m0, m1 4783 movd r4d, m0 4784 add r4d, 1 4785 shr r4d, 1 4786 add r4d, dword [esp+36] 4787 mov dword [esp+36], r4d 4788 4789 mov r0, [r6+20] 4790 mov r2, [r6+28] 4791 add r0, 24*SIZEOF_PIXEL 4792 add r2, 24*SIZEOF_PIXEL 4793 lea r4, [r1 + 2*r1] 4794 call pixel_sa8d_8x8_internal2 4795 HADDUW m0, m1 4796 movd r4d, m0 4797 add r4d, 1 4798 shr r4d, 1 4799 add r4d, dword [esp+36] 4800 mov eax, r4d 4801 mov esp, r6 4802 RET 4803 4804cglobal pixel_sa8d_32x32, 4,7,8 4805 FIX_STRIDES r1, r3 4806 mov r6, esp 4807 and esp, ~15 4808 sub esp, 64 4809 4810 lea r4, [r1 + 2*r1] 4811 lea r5, [r3 + 2*r3] 4812 call pixel_sa8d_8x8_internal2 4813%if HIGH_BIT_DEPTH 4814 HADDUW m0, m1 4815%endif 4816 mova [rsp+48], m0 4817 call pixel_sa8d_8x8_internal2 4818 SA8D_INTER 4819 mova [esp+48], m0 4820 4821 mov r0, [r6+20] 4822 mov r2, [r6+28] 4823 add r0, 8*SIZEOF_PIXEL 4824 add r2, 8*SIZEOF_PIXEL 4825 call pixel_sa8d_8x8_internal2 4826 SA8D_INTER 4827 mova [esp+48], m0 4828 call pixel_sa8d_8x8_internal2 4829 SA8D_INTER 4830%if HIGH_BIT_DEPTH == 0 4831 HADDUW m0, m1 4832%endif 4833 movd r4d, m0 4834 add r4d, 1 4835 shr r4d, 1 4836 mov dword [esp+36], r4d 4837 4838 mov r0, [r6+20] 4839 mov r2, [r6+28] 4840 add r0, 16*SIZEOF_PIXEL 4841 add r2, 16*SIZEOF_PIXEL 4842 lea r4, [r1 + 2*r1] 4843 call pixel_sa8d_8x8_internal2 4844%if HIGH_BIT_DEPTH 4845 HADDUW m0, m1 4846%endif 4847 mova [esp+48], m0 4848 call pixel_sa8d_8x8_internal2 4849 SA8D_INTER 4850 mova [esp+48], m0 4851 4852 mov r0, [r6+20] 4853 mov r2, [r6+28] 4854 add r0, 24*SIZEOF_PIXEL 4855 add r2, 24*SIZEOF_PIXEL 4856 call pixel_sa8d_8x8_internal2 4857 SA8D_INTER 4858 mova [esp+64-mmsize], m0 4859 call pixel_sa8d_8x8_internal2 4860 AVG_16x16 4861 4862 mov r0, [r6+20] 4863 mov r2, [r6+28] 4864 lea r0, [r0 + r1*8] 4865 lea r2, [r2 + r3*8] 4866 lea r0, [r0 + r1*8] 4867 lea r2, [r2 + r3*8] 4868 lea r4, [r1 + 2*r1] 4869 call pixel_sa8d_8x8_internal2 4870%if HIGH_BIT_DEPTH 4871 HADDUW m0, m1 4872%endif 4873 mova [esp+48], m0 4874 call pixel_sa8d_8x8_internal2 4875 SA8D_INTER 4876 mova [esp+48], m0 4877 4878 mov r0, [r6+20] 4879 mov r2, [r6+28] 4880 lea r0, [r0 + r1*8] 4881 lea r2, [r2 + r3*8] 4882 lea r0, [r0 + r1*8] 4883 lea r2, [r2 + r3*8] 4884 add r0, 8*SIZEOF_PIXEL 4885 add r2, 8*SIZEOF_PIXEL 4886 call pixel_sa8d_8x8_internal2 4887 SA8D_INTER 4888 mova [esp+64-mmsize], m0 4889 call pixel_sa8d_8x8_internal2 4890 AVG_16x16 4891 4892 mov r0, [r6+20] 
4893 mov r2, [r6+28] 4894 lea r0, [r0 + r1*8] 4895 lea r2, [r2 + r3*8] 4896 lea r0, [r0 + r1*8] 4897 lea r2, [r2 + r3*8] 4898 add r0, 16*SIZEOF_PIXEL 4899 add r2, 16*SIZEOF_PIXEL 4900 lea r4, [r1 + 2*r1] 4901 call pixel_sa8d_8x8_internal2 4902%if HIGH_BIT_DEPTH 4903 HADDUW m0, m1 4904%endif 4905 mova [esp+48], m0 4906 call pixel_sa8d_8x8_internal2 4907 SA8D_INTER 4908 mova [esp+48], m0 4909 4910 mov r0, [r6+20] 4911 mov r2, [r6+28] 4912 lea r0, [r0 + r1*8] 4913 lea r2, [r2 + r3*8] 4914 lea r0, [r0 + r1*8] 4915 lea r2, [r2 + r3*8] 4916 add r0, 24*SIZEOF_PIXEL 4917 add r2, 24*SIZEOF_PIXEL 4918 call pixel_sa8d_8x8_internal2 4919 SA8D_INTER 4920 mova [esp+64-mmsize], m0 4921 call pixel_sa8d_8x8_internal2 4922 SA8D_INTER 4923%if HIGH_BIT_DEPTH == 0 4924 HADDUW m0, m1 4925%endif 4926 movd r4d, m0 4927 add r4d, 1 4928 shr r4d, 1 4929 add r4d, dword [esp+36] 4930 mov eax, r4d 4931 mov esp, r6 4932 RET 4933 4934cglobal pixel_sa8d_32x64, 4,7,8 4935 FIX_STRIDES r1, r3 4936 mov r6, esp 4937 and esp, ~15 4938 sub esp, 64 4939 4940 lea r4, [r1 + 2*r1] 4941 lea r5, [r3 + 2*r3] 4942 call pixel_sa8d_8x8_internal2 4943%if HIGH_BIT_DEPTH 4944 HADDUW m0, m1 4945%endif 4946 mova [rsp+48], m0 4947 call pixel_sa8d_8x8_internal2 4948 SA8D_INTER 4949 mova [esp+48], m0 4950 4951 mov r0, [r6+20] 4952 mov r2, [r6+28] 4953 add r0, 8*SIZEOF_PIXEL 4954 add r2, 8*SIZEOF_PIXEL 4955 call pixel_sa8d_8x8_internal2 4956 SA8D_INTER 4957 mova [esp+48], m0 4958 call pixel_sa8d_8x8_internal2 4959 SA8D_INTER 4960%if HIGH_BIT_DEPTH == 0 4961 HADDUW m0, m1 4962%endif 4963 movd r4d, m0 4964 add r4d, 1 4965 shr r4d, 1 4966 mov dword [esp+36], r4d 4967 4968 mov r0, [r6+20] 4969 mov r2, [r6+28] 4970 add r0, 16*SIZEOF_PIXEL 4971 add r2, 16*SIZEOF_PIXEL 4972 lea r4, [r1 + 2*r1] 4973 call pixel_sa8d_8x8_internal2 4974%if HIGH_BIT_DEPTH 4975 HADDUW m0, m1 4976%endif 4977 mova [esp+48], m0 4978 call pixel_sa8d_8x8_internal2 4979 SA8D_INTER 4980 mova [esp+48], m0 4981 4982 mov r0, [r6+20] 4983 mov r2, [r6+28] 4984 add r0, 24*SIZEOF_PIXEL 4985 add r2, 24*SIZEOF_PIXEL 4986 call pixel_sa8d_8x8_internal2 4987 SA8D_INTER 4988 mova [esp+64-mmsize], m0 4989 call pixel_sa8d_8x8_internal2 4990 AVG_16x16 4991 4992 mov r0, [r6+20] 4993 mov r2, [r6+28] 4994 lea r0, [r0 + r1*8] 4995 lea r2, [r2 + r3*8] 4996 lea r0, [r0 + r1*8] 4997 lea r2, [r2 + r3*8] 4998 mov [r6+20], r0 4999 mov [r6+28], r2 5000 5001 lea r4, [r1 + 2*r1] 5002 call pixel_sa8d_8x8_internal2 5003%if HIGH_BIT_DEPTH 5004 HADDUW m0, m1 5005%endif 5006 mova [esp+48], m0 5007 call pixel_sa8d_8x8_internal2 5008 SA8D_INTER 5009 mova [esp+48], m0 5010 5011 mov r0, [r6+20] 5012 mov r2, [r6+28] 5013 add r0, 8*SIZEOF_PIXEL 5014 add r2, 8*SIZEOF_PIXEL 5015 call pixel_sa8d_8x8_internal2 5016 SA8D_INTER 5017 mova [esp+64-mmsize], m0 5018 call pixel_sa8d_8x8_internal2 5019 AVG_16x16 5020 5021 mov r0, [r6+20] 5022 mov r2, [r6+28] 5023 add r0, 16*SIZEOF_PIXEL 5024 add r2, 16*SIZEOF_PIXEL 5025 lea r4, [r1 + 2*r1] 5026 call pixel_sa8d_8x8_internal2 5027%if HIGH_BIT_DEPTH 5028 HADDUW m0, m1 5029%endif 5030 mova [esp+48], m0 5031 call pixel_sa8d_8x8_internal2 5032 SA8D_INTER 5033 mova [esp+48], m0 5034 5035 mov r0, [r6+20] 5036 mov r2, [r6+28] 5037 add r0, 24*SIZEOF_PIXEL 5038 add r2, 24*SIZEOF_PIXEL 5039 call pixel_sa8d_8x8_internal2 5040 SA8D_INTER 5041 mova [esp+64-mmsize], m0 5042 call pixel_sa8d_8x8_internal2 5043 AVG_16x16 5044 5045 mov r0, [r6+20] 5046 mov r2, [r6+28] 5047 lea r0, [r0 + r1*8] 5048 lea r2, [r2 + r3*8] 5049 lea r0, [r0 + r1*8] 5050 lea r2, [r2 + r3*8] 5051 mov [r6+20], r0 5052 mov [r6+28], 
r2 5053 5054 lea r4, [r1 + 2*r1] 5055 call pixel_sa8d_8x8_internal2 5056%if HIGH_BIT_DEPTH 5057 HADDUW m0, m1 5058%endif 5059 mova [esp+48], m0 5060 call pixel_sa8d_8x8_internal2 5061 SA8D_INTER 5062 mova [esp+48], m0 5063 5064 mov r0, [r6+20] 5065 mov r2, [r6+28] 5066 add r0, 8*SIZEOF_PIXEL 5067 add r2, 8*SIZEOF_PIXEL 5068 call pixel_sa8d_8x8_internal2 5069 SA8D_INTER 5070 mova [esp+64-mmsize], m0 5071 call pixel_sa8d_8x8_internal2 5072 AVG_16x16 5073 5074 mov r0, [r6+20] 5075 mov r2, [r6+28] 5076 add r0, 16*SIZEOF_PIXEL 5077 add r2, 16*SIZEOF_PIXEL 5078 lea r4, [r1 + 2*r1] 5079 call pixel_sa8d_8x8_internal2 5080%if HIGH_BIT_DEPTH 5081 HADDUW m0, m1 5082%endif 5083 mova [esp+48], m0 5084 call pixel_sa8d_8x8_internal2 5085 SA8D_INTER 5086 mova [esp+48], m0 5087 5088 mov r0, [r6+20] 5089 mov r2, [r6+28] 5090 add r0, 24*SIZEOF_PIXEL 5091 add r2, 24*SIZEOF_PIXEL 5092 call pixel_sa8d_8x8_internal2 5093 SA8D_INTER 5094 mova [esp+64-mmsize], m0 5095 call pixel_sa8d_8x8_internal2 5096 AVG_16x16 5097 5098 mov r0, [r6+20] 5099 mov r2, [r6+28] 5100 lea r0, [r0 + r1*8] 5101 lea r2, [r2 + r3*8] 5102 lea r0, [r0 + r1*8] 5103 lea r2, [r2 + r3*8] 5104 mov [r6+20], r0 5105 mov [r6+28], r2 5106 5107 lea r4, [r1 + 2*r1] 5108 call pixel_sa8d_8x8_internal2 5109%if HIGH_BIT_DEPTH 5110 HADDUW m0, m1 5111%endif 5112 mova [esp+48], m0 5113 call pixel_sa8d_8x8_internal2 5114 SA8D_INTER 5115 mova [esp+48], m0 5116 5117 mov r0, [r6+20] 5118 mov r2, [r6+28] 5119 add r0, 8*SIZEOF_PIXEL 5120 add r2, 8*SIZEOF_PIXEL 5121 call pixel_sa8d_8x8_internal2 5122 SA8D_INTER 5123 mova [esp+64-mmsize], m0 5124 call pixel_sa8d_8x8_internal2 5125 AVG_16x16 5126 5127 mov r0, [r6+20] 5128 mov r2, [r6+28] 5129 add r0, 16*SIZEOF_PIXEL 5130 add r2, 16*SIZEOF_PIXEL 5131 lea r4, [r1 + 2*r1] 5132 call pixel_sa8d_8x8_internal2 5133%if HIGH_BIT_DEPTH 5134 HADDUW m0, m1 5135%endif 5136 mova [esp+48], m0 5137 call pixel_sa8d_8x8_internal2 5138 SA8D_INTER 5139 mova [esp+48], m0 5140 5141 mov r0, [r6+20] 5142 mov r2, [r6+28] 5143 add r0, 24*SIZEOF_PIXEL 5144 add r2, 24*SIZEOF_PIXEL 5145 call pixel_sa8d_8x8_internal2 5146 SA8D_INTER 5147 mova [esp+64-mmsize], m0 5148 call pixel_sa8d_8x8_internal2 5149 SA8D_INTER 5150%if HIGH_BIT_DEPTH == 0 5151 HADDUW m0, m1 5152%endif 5153 movd r4d, m0 5154 add r4d, 1 5155 shr r4d, 1 5156 add r4d, dword [esp+36] 5157 mov eax, r4d 5158 mov esp, r6 5159 RET 5160 5161cglobal pixel_sa8d_48x64, 4,7,8 5162 FIX_STRIDES r1, r3 5163 mov r6, esp 5164 and esp, ~15 5165 sub esp, 64 5166 5167 lea r4, [r1 + 2*r1] 5168 lea r5, [r3 + 2*r3] 5169 call pixel_sa8d_8x8_internal2 5170%if HIGH_BIT_DEPTH 5171 HADDUW m0, m1 5172%endif 5173 mova [rsp+48], m0 5174 call pixel_sa8d_8x8_internal2 5175 SA8D_INTER 5176 mova [esp+48], m0 5177 5178 mov r0, [r6+20] 5179 mov r2, [r6+28] 5180 add r0, 8*SIZEOF_PIXEL 5181 add r2, 8*SIZEOF_PIXEL 5182 call pixel_sa8d_8x8_internal2 5183 SA8D_INTER 5184 mova [esp+48], m0 5185 call pixel_sa8d_8x8_internal2 5186 SA8D_INTER 5187%if HIGH_BIT_DEPTH == 0 5188 HADDUW m0, m1 5189%endif 5190 movd r4d, m0 5191 add r4d, 1 5192 shr r4d, 1 5193 mov dword [esp+36], r4d 5194 5195 mov r0, [r6+20] 5196 mov r2, [r6+28] 5197 add r0, 16*SIZEOF_PIXEL 5198 add r2, 16*SIZEOF_PIXEL 5199 lea r4, [r1 + 2*r1] 5200 call pixel_sa8d_8x8_internal2 5201%if HIGH_BIT_DEPTH 5202 HADDUW m0, m1 5203%endif 5204 mova [esp+48], m0 5205 call pixel_sa8d_8x8_internal2 5206 SA8D_INTER 5207 mova [esp+48], m0 5208 5209 mov r0, [r6+20] 5210 mov r2, [r6+28] 5211 add r0, 24*SIZEOF_PIXEL 5212 add r2, 24*SIZEOF_PIXEL 5213 call pixel_sa8d_8x8_internal2 
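    ; SA8D_INTER appears to fold this 8x8 result into the partial sum
    ; stashed in the aligned stack area ([esp+48] / [esp+64-mmsize])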
5214 SA8D_INTER 5215 mova [esp+64-mmsize], m0 5216 call pixel_sa8d_8x8_internal2 5217 AVG_16x16 5218 5219 mov r0, [r6+20] 5220 mov r2, [r6+28] 5221 add r0, 32*SIZEOF_PIXEL 5222 add r2, 32*SIZEOF_PIXEL 5223 lea r4, [r1 + 2*r1] 5224 call pixel_sa8d_8x8_internal2 5225%if HIGH_BIT_DEPTH 5226 HADDUW m0, m1 5227%endif 5228 mova [esp+48], m0 5229 call pixel_sa8d_8x8_internal2 5230 SA8D_INTER 5231 mova [esp+48], m0 5232 5233 mov r0, [r6+20] 5234 mov r2, [r6+28] 5235 add r0, 40*SIZEOF_PIXEL 5236 add r2, 40*SIZEOF_PIXEL 5237 call pixel_sa8d_8x8_internal2 5238 SA8D_INTER 5239 mova [esp+64-mmsize], m0 5240 call pixel_sa8d_8x8_internal2 5241 AVG_16x16 5242 5243 mov r0, [r6+20] 5244 mov r2, [r6+28] 5245 lea r0, [r0 + r1*8] 5246 lea r2, [r2 + r3*8] 5247 lea r0, [r0 + r1*8] 5248 lea r2, [r2 + r3*8] 5249 mov [r6+20], r0 5250 mov [r6+28], r2 5251 5252 lea r4, [r1 + 2*r1] 5253 call pixel_sa8d_8x8_internal2 5254%if HIGH_BIT_DEPTH 5255 HADDUW m0, m1 5256%endif 5257 mova [esp+48], m0 5258 call pixel_sa8d_8x8_internal2 5259 SA8D_INTER 5260 mova [esp+48], m0 5261 5262 mov r0, [r6+20] 5263 mov r2, [r6+28] 5264 add r0, 8*SIZEOF_PIXEL 5265 add r2, 8*SIZEOF_PIXEL 5266 call pixel_sa8d_8x8_internal2 5267 SA8D_INTER 5268 mova [esp+64-mmsize], m0 5269 call pixel_sa8d_8x8_internal2 5270 AVG_16x16 5271 5272 mov r0, [r6+20] 5273 mov r2, [r6+28] 5274 add r0, 16*SIZEOF_PIXEL 5275 add r2, 16*SIZEOF_PIXEL 5276 lea r4, [r1 + 2*r1] 5277 call pixel_sa8d_8x8_internal2 5278%if HIGH_BIT_DEPTH 5279 HADDUW m0, m1 5280%endif 5281 mova [esp+48], m0 5282 call pixel_sa8d_8x8_internal2 5283 SA8D_INTER 5284 mova [esp+48], m0 5285 5286 mov r0, [r6+20] 5287 mov r2, [r6+28] 5288 add r0, 24*SIZEOF_PIXEL 5289 add r2, 24*SIZEOF_PIXEL 5290 call pixel_sa8d_8x8_internal2 5291 SA8D_INTER 5292 mova [esp+64-mmsize], m0 5293 call pixel_sa8d_8x8_internal2 5294 AVG_16x16 5295 5296 mov r0, [r6+20] 5297 mov r2, [r6+28] 5298 add r0, 32*SIZEOF_PIXEL 5299 add r2, 32*SIZEOF_PIXEL 5300 lea r4, [r1 + 2*r1] 5301 call pixel_sa8d_8x8_internal2 5302%if HIGH_BIT_DEPTH 5303 HADDUW m0, m1 5304%endif 5305 mova [esp+48], m0 5306 call pixel_sa8d_8x8_internal2 5307 SA8D_INTER 5308 mova [esp+48], m0 5309 5310 mov r0, [r6+20] 5311 mov r2, [r6+28] 5312 add r0, 40*SIZEOF_PIXEL 5313 add r2, 40*SIZEOF_PIXEL 5314 call pixel_sa8d_8x8_internal2 5315 SA8D_INTER 5316 mova [esp+64-mmsize], m0 5317 call pixel_sa8d_8x8_internal2 5318 AVG_16x16 5319 5320 mov r0, [r6+20] 5321 mov r2, [r6+28] 5322 lea r0, [r0 + r1*8] 5323 lea r2, [r2 + r3*8] 5324 lea r0, [r0 + r1*8] 5325 lea r2, [r2 + r3*8] 5326 mov [r6+20], r0 5327 mov [r6+28], r2 5328 5329 lea r4, [r1 + 2*r1] 5330 call pixel_sa8d_8x8_internal2 5331%if HIGH_BIT_DEPTH 5332 HADDUW m0, m1 5333%endif 5334 mova [esp+48], m0 5335 call pixel_sa8d_8x8_internal2 5336 SA8D_INTER 5337 mova [esp+48], m0 5338 5339 mov r0, [r6+20] 5340 mov r2, [r6+28] 5341 add r0, 8*SIZEOF_PIXEL 5342 add r2, 8*SIZEOF_PIXEL 5343 call pixel_sa8d_8x8_internal2 5344 SA8D_INTER 5345 mova [esp+64-mmsize], m0 5346 call pixel_sa8d_8x8_internal2 5347 AVG_16x16 5348 5349 mov r0, [r6+20] 5350 mov r2, [r6+28] 5351 add r0, 16*SIZEOF_PIXEL 5352 add r2, 16*SIZEOF_PIXEL 5353 lea r4, [r1 + 2*r1] 5354 call pixel_sa8d_8x8_internal2 5355%if HIGH_BIT_DEPTH 5356 HADDUW m0, m1 5357%endif 5358 mova [esp+48], m0 5359 call pixel_sa8d_8x8_internal2 5360 SA8D_INTER 5361 mova [esp+48], m0 5362 5363 mov r0, [r6+20] 5364 mov r2, [r6+28] 5365 add r0, 24*SIZEOF_PIXEL 5366 add r2, 24*SIZEOF_PIXEL 5367 call pixel_sa8d_8x8_internal2 5368 SA8D_INTER 5369 mova [esp+64-mmsize], m0 5370 call 
pixel_sa8d_8x8_internal2 5371 AVG_16x16 5372 5373 mov r0, [r6+20] 5374 mov r2, [r6+28] 5375 add r0, 32*SIZEOF_PIXEL 5376 add r2, 32*SIZEOF_PIXEL 5377 lea r4, [r1 + 2*r1] 5378 call pixel_sa8d_8x8_internal2 5379%if HIGH_BIT_DEPTH 5380 HADDUW m0, m1 5381%endif 5382 mova [esp+48], m0 5383 call pixel_sa8d_8x8_internal2 5384 SA8D_INTER 5385 mova [esp+48], m0 5386 5387 mov r0, [r6+20] 5388 mov r2, [r6+28] 5389 add r0, 40*SIZEOF_PIXEL 5390 add r2, 40*SIZEOF_PIXEL 5391 call pixel_sa8d_8x8_internal2 5392 SA8D_INTER 5393 mova [esp+64-mmsize], m0 5394 call pixel_sa8d_8x8_internal2 5395 AVG_16x16 5396 5397 mov r0, [r6+20] 5398 mov r2, [r6+28] 5399 lea r0, [r0 + r1*8] 5400 lea r2, [r2 + r3*8] 5401 lea r0, [r0 + r1*8] 5402 lea r2, [r2 + r3*8] 5403 mov [r6+20], r0 5404 mov [r6+28], r2 5405 5406 lea r4, [r1 + 2*r1] 5407 call pixel_sa8d_8x8_internal2 5408%if HIGH_BIT_DEPTH 5409 HADDUW m0, m1 5410%endif 5411 mova [esp+48], m0 5412 call pixel_sa8d_8x8_internal2 5413 SA8D_INTER 5414 mova [esp+48], m0 5415 5416 mov r0, [r6+20] 5417 mov r2, [r6+28] 5418 add r0, 8*SIZEOF_PIXEL 5419 add r2, 8*SIZEOF_PIXEL 5420 call pixel_sa8d_8x8_internal2 5421 SA8D_INTER 5422 mova [esp+64-mmsize], m0 5423 call pixel_sa8d_8x8_internal2 5424 AVG_16x16 5425 5426 mov r0, [r6+20] 5427 mov r2, [r6+28] 5428 add r0, 16*SIZEOF_PIXEL 5429 add r2, 16*SIZEOF_PIXEL 5430 lea r4, [r1 + 2*r1] 5431 call pixel_sa8d_8x8_internal2 5432%if HIGH_BIT_DEPTH 5433 HADDUW m0, m1 5434%endif 5435 mova [esp+48], m0 5436 call pixel_sa8d_8x8_internal2 5437 SA8D_INTER 5438 mova [esp+48], m0 5439 5440 mov r0, [r6+20] 5441 mov r2, [r6+28] 5442 add r0, 24*SIZEOF_PIXEL 5443 add r2, 24*SIZEOF_PIXEL 5444 call pixel_sa8d_8x8_internal2 5445 SA8D_INTER 5446 mova [esp+64-mmsize], m0 5447 call pixel_sa8d_8x8_internal2 5448 AVG_16x16 5449 5450 mov r0, [r6+20] 5451 mov r2, [r6+28] 5452 add r0, 32*SIZEOF_PIXEL 5453 add r2, 32*SIZEOF_PIXEL 5454 lea r4, [r1 + 2*r1] 5455 call pixel_sa8d_8x8_internal2 5456%if HIGH_BIT_DEPTH 5457 HADDUW m0, m1 5458%endif 5459 mova [esp+48], m0 5460 call pixel_sa8d_8x8_internal2 5461 SA8D_INTER 5462 mova [esp+48], m0 5463 5464 mov r0, [r6+20] 5465 mov r2, [r6+28] 5466 add r0, 40*SIZEOF_PIXEL 5467 add r2, 40*SIZEOF_PIXEL 5468 call pixel_sa8d_8x8_internal2 5469 SA8D_INTER 5470 mova [esp+64-mmsize], m0 5471 call pixel_sa8d_8x8_internal2 5472 SA8D_INTER 5473%if HIGH_BIT_DEPTH == 0 5474 HADDUW m0, m1 5475%endif 5476 movd r4d, m0 5477 add r4d, 1 5478 shr r4d, 1 5479 add r4d, dword [esp+36] 5480 mov eax, r4d 5481 mov esp, r6 5482 RET 5483 5484cglobal pixel_sa8d_64x16, 4,7,8 5485 FIX_STRIDES r1, r3 5486 mov r6, esp 5487 and esp, ~15 5488 sub esp, 64 5489 5490 lea r4, [r1 + 2*r1] 5491 lea r5, [r3 + 2*r3] 5492 call pixel_sa8d_8x8_internal2 5493%if HIGH_BIT_DEPTH 5494 HADDUW m0, m1 5495%endif 5496 mova [rsp+48], m0 5497 call pixel_sa8d_8x8_internal2 5498 SA8D_INTER 5499 mova [esp+48], m0 5500 5501 mov r0, [r6+20] 5502 mov r2, [r6+28] 5503 add r0, 8*SIZEOF_PIXEL 5504 add r2, 8*SIZEOF_PIXEL 5505 call pixel_sa8d_8x8_internal2 5506 SA8D_INTER 5507 mova [esp+48], m0 5508 call pixel_sa8d_8x8_internal2 5509 SA8D_INTER 5510%if HIGH_BIT_DEPTH == 0 5511 HADDUW m0, m1 5512%endif 5513 movd r4d, m0 5514 add r4d, 1 5515 shr r4d, 1 5516 mov dword [esp+36], r4d 5517 5518 mov r0, [r6+20] 5519 mov r2, [r6+28] 5520 add r0, 16*SIZEOF_PIXEL 5521 add r2, 16*SIZEOF_PIXEL 5522 lea r4, [r1 + 2*r1] 5523 call pixel_sa8d_8x8_internal2 5524%if HIGH_BIT_DEPTH 5525 HADDUW m0, m1 5526%endif 5527 mova [esp+48], m0 5528 call pixel_sa8d_8x8_internal2 5529 SA8D_INTER 5530 mova [esp+48], m0 5531 
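    ; step to the next 8x8 column: [r6+20]/[r6+28] presumably hold the pix1/pix2
    ; arguments on the caller's stack (r6 saved the original esp), so reloading
    ; them and adding the column offset re-bases both pointers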
5532 mov r0, [r6+20] 5533 mov r2, [r6+28] 5534 add r0, 24*SIZEOF_PIXEL 5535 add r2, 24*SIZEOF_PIXEL 5536 call pixel_sa8d_8x8_internal2 5537 SA8D_INTER 5538 mova [esp+64-mmsize], m0 5539 call pixel_sa8d_8x8_internal2 5540 AVG_16x16 5541 5542 mov r0, [r6+20] 5543 mov r2, [r6+28] 5544 add r0, 32*SIZEOF_PIXEL 5545 add r2, 32*SIZEOF_PIXEL 5546 lea r4, [r1 + 2*r1] 5547 call pixel_sa8d_8x8_internal2 5548%if HIGH_BIT_DEPTH 5549 HADDUW m0, m1 5550%endif 5551 mova [esp+48], m0 5552 call pixel_sa8d_8x8_internal2 5553 SA8D_INTER 5554 mova [esp+48], m0 5555 5556 mov r0, [r6+20] 5557 mov r2, [r6+28] 5558 add r0, 40*SIZEOF_PIXEL 5559 add r2, 40*SIZEOF_PIXEL 5560 call pixel_sa8d_8x8_internal2 5561 SA8D_INTER 5562 mova [esp+64-mmsize], m0 5563 call pixel_sa8d_8x8_internal2 5564 AVG_16x16 5565 5566 mov r0, [r6+20] 5567 mov r2, [r6+28] 5568 add r0, 48*SIZEOF_PIXEL 5569 add r2, 48*SIZEOF_PIXEL 5570 lea r4, [r1 + 2*r1] 5571 call pixel_sa8d_8x8_internal2 5572%if HIGH_BIT_DEPTH 5573 HADDUW m0, m1 5574%endif 5575 mova [esp+48], m0 5576 call pixel_sa8d_8x8_internal2 5577 SA8D_INTER 5578 mova [esp+48], m0 5579 5580 mov r0, [r6+20] 5581 mov r2, [r6+28] 5582 add r0, 56*SIZEOF_PIXEL 5583 add r2, 56*SIZEOF_PIXEL 5584 call pixel_sa8d_8x8_internal2 5585 SA8D_INTER 5586 mova [esp+64-mmsize], m0 5587 call pixel_sa8d_8x8_internal2 5588 SA8D_INTER 5589%if HIGH_BIT_DEPTH == 0 5590 HADDUW m0, m1 5591%endif 5592 movd r4d, m0 5593 add r4d, 1 5594 shr r4d, 1 5595 add r4d, dword [esp+36] 5596 mov eax, r4d 5597 mov esp, r6 5598 RET 5599 5600cglobal pixel_sa8d_64x32, 4,7,8 5601 FIX_STRIDES r1, r3 5602 mov r6, esp 5603 and esp, ~15 5604 sub esp, 64 5605 5606 lea r4, [r1 + 2*r1] 5607 lea r5, [r3 + 2*r3] 5608 call pixel_sa8d_8x8_internal2 5609%if HIGH_BIT_DEPTH 5610 HADDUW m0, m1 5611%endif 5612 mova [rsp+48], m0 5613 call pixel_sa8d_8x8_internal2 5614 SA8D_INTER 5615 mova [esp+48], m0 5616 5617 mov r0, [r6+20] 5618 mov r2, [r6+28] 5619 add r0, 8*SIZEOF_PIXEL 5620 add r2, 8*SIZEOF_PIXEL 5621 call pixel_sa8d_8x8_internal2 5622 SA8D_INTER 5623 mova [esp+48], m0 5624 call pixel_sa8d_8x8_internal2 5625 SA8D_INTER 5626%if HIGH_BIT_DEPTH == 0 5627 HADDUW m0, m1 5628%endif 5629 movd r4d, m0 5630 add r4d, 1 5631 shr r4d, 1 5632 mov dword [esp+36], r4d 5633 5634 mov r0, [r6+20] 5635 mov r2, [r6+28] 5636 add r0, 16*SIZEOF_PIXEL 5637 add r2, 16*SIZEOF_PIXEL 5638 lea r4, [r1 + 2*r1] 5639 call pixel_sa8d_8x8_internal2 5640%if HIGH_BIT_DEPTH 5641 HADDUW m0, m1 5642%endif 5643 mova [esp+48], m0 5644 call pixel_sa8d_8x8_internal2 5645 SA8D_INTER 5646 mova [esp+48], m0 5647 5648 mov r0, [r6+20] 5649 mov r2, [r6+28] 5650 add r0, 24*SIZEOF_PIXEL 5651 add r2, 24*SIZEOF_PIXEL 5652 call pixel_sa8d_8x8_internal2 5653 SA8D_INTER 5654 mova [esp+64-mmsize], m0 5655 call pixel_sa8d_8x8_internal2 5656 AVG_16x16 5657 5658 mov r0, [r6+20] 5659 mov r2, [r6+28] 5660 add r0, 32*SIZEOF_PIXEL 5661 add r2, 32*SIZEOF_PIXEL 5662 lea r4, [r1 + 2*r1] 5663 call pixel_sa8d_8x8_internal2 5664%if HIGH_BIT_DEPTH 5665 HADDUW m0, m1 5666%endif 5667 mova [esp+48], m0 5668 call pixel_sa8d_8x8_internal2 5669 SA8D_INTER 5670 mova [esp+48], m0 5671 5672 mov r0, [r6+20] 5673 mov r2, [r6+28] 5674 add r0, 40*SIZEOF_PIXEL 5675 add r2, 40*SIZEOF_PIXEL 5676 call pixel_sa8d_8x8_internal2 5677 SA8D_INTER 5678 mova [esp+64-mmsize], m0 5679 call pixel_sa8d_8x8_internal2 5680 AVG_16x16 5681 5682 mov r0, [r6+20] 5683 mov r2, [r6+28] 5684 add r0, 48*SIZEOF_PIXEL 5685 add r2, 48*SIZEOF_PIXEL 5686 lea r4, [r1 + 2*r1] 5687 call pixel_sa8d_8x8_internal2 5688%if HIGH_BIT_DEPTH 5689 HADDUW m0, m1 
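    ; at high bit depth the packed word sums can exceed 16 bits, so HADDUW
    ; widens them to dwords before they are accumulated any further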
5690%endif 5691 mova [esp+48], m0 5692 call pixel_sa8d_8x8_internal2 5693 SA8D_INTER 5694 mova [esp+48], m0 5695 5696 mov r0, [r6+20] 5697 mov r2, [r6+28] 5698 add r0, 56*SIZEOF_PIXEL 5699 add r2, 56*SIZEOF_PIXEL 5700 call pixel_sa8d_8x8_internal2 5701 SA8D_INTER 5702 mova [esp+64-mmsize], m0 5703 call pixel_sa8d_8x8_internal2 5704 AVG_16x16 5705 5706 mov r0, [r6+20] 5707 mov r2, [r6+28] 5708 lea r0, [r0 + r1*8] 5709 lea r2, [r2 + r3*8] 5710 lea r0, [r0 + r1*8] 5711 lea r2, [r2 + r3*8] 5712 mov [r6+20], r0 5713 mov [r6+28], r2 5714 5715 lea r4, [r1 + 2*r1] 5716 call pixel_sa8d_8x8_internal2 5717%if HIGH_BIT_DEPTH 5718 HADDUW m0, m1 5719%endif 5720 mova [esp+48], m0 5721 call pixel_sa8d_8x8_internal2 5722 SA8D_INTER 5723 mova [esp+48], m0 5724 5725 mov r0, [r6+20] 5726 mov r2, [r6+28] 5727 add r0, 8*SIZEOF_PIXEL 5728 add r2, 8*SIZEOF_PIXEL 5729 call pixel_sa8d_8x8_internal2 5730 SA8D_INTER 5731 mova [esp+64-mmsize], m0 5732 call pixel_sa8d_8x8_internal2 5733 AVG_16x16 5734 5735 mov r0, [r6+20] 5736 mov r2, [r6+28] 5737 add r0, 16*SIZEOF_PIXEL 5738 add r2, 16*SIZEOF_PIXEL 5739 lea r4, [r1 + 2*r1] 5740 call pixel_sa8d_8x8_internal2 5741%if HIGH_BIT_DEPTH 5742 HADDUW m0, m1 5743%endif 5744 mova [esp+48], m0 5745 call pixel_sa8d_8x8_internal2 5746 SA8D_INTER 5747 mova [esp+48], m0 5748 5749 mov r0, [r6+20] 5750 mov r2, [r6+28] 5751 add r0, 24*SIZEOF_PIXEL 5752 add r2, 24*SIZEOF_PIXEL 5753 call pixel_sa8d_8x8_internal2 5754 SA8D_INTER 5755 mova [esp+64-mmsize], m0 5756 call pixel_sa8d_8x8_internal2 5757 AVG_16x16 5758 5759 mov r0, [r6+20] 5760 mov r2, [r6+28] 5761 add r0, 32*SIZEOF_PIXEL 5762 add r2, 32*SIZEOF_PIXEL 5763 lea r4, [r1 + 2*r1] 5764 call pixel_sa8d_8x8_internal2 5765%if HIGH_BIT_DEPTH 5766 HADDUW m0, m1 5767%endif 5768 mova [esp+48], m0 5769 call pixel_sa8d_8x8_internal2 5770 SA8D_INTER 5771 mova [esp+48], m0 5772 5773 mov r0, [r6+20] 5774 mov r2, [r6+28] 5775 add r0, 40*SIZEOF_PIXEL 5776 add r2, 40*SIZEOF_PIXEL 5777 call pixel_sa8d_8x8_internal2 5778 SA8D_INTER 5779 mova [esp+64-mmsize], m0 5780 call pixel_sa8d_8x8_internal2 5781 AVG_16x16 5782 5783 mov r0, [r6+20] 5784 mov r2, [r6+28] 5785 add r0, 48*SIZEOF_PIXEL 5786 add r2, 48*SIZEOF_PIXEL 5787 lea r4, [r1 + 2*r1] 5788 call pixel_sa8d_8x8_internal2 5789%if HIGH_BIT_DEPTH 5790 HADDUW m0, m1 5791%endif 5792 mova [esp+48], m0 5793 call pixel_sa8d_8x8_internal2 5794 SA8D_INTER 5795 mova [esp+48], m0 5796 5797 mov r0, [r6+20] 5798 mov r2, [r6+28] 5799 add r0, 56*SIZEOF_PIXEL 5800 add r2, 56*SIZEOF_PIXEL 5801 call pixel_sa8d_8x8_internal2 5802 SA8D_INTER 5803 mova [esp+64-mmsize], m0 5804 call pixel_sa8d_8x8_internal2 5805 SA8D_INTER 5806%if HIGH_BIT_DEPTH == 0 5807 HADDUW m0, m1 5808%endif 5809 movd r4d, m0 5810 add r4d, 1 5811 shr r4d, 1 5812 add r4d, dword [esp+36] 5813 mov eax, r4d 5814 mov esp, r6 5815 RET 5816 5817cglobal pixel_sa8d_64x48, 4,7,8 5818 FIX_STRIDES r1, r3 5819 mov r6, esp 5820 and esp, ~15 5821 sub esp, 64 5822 5823 lea r4, [r1 + 2*r1] 5824 lea r5, [r3 + 2*r3] 5825 call pixel_sa8d_8x8_internal2 5826%if HIGH_BIT_DEPTH 5827 HADDUW m0, m1 5828%endif 5829 mova [rsp+48], m0 5830 call pixel_sa8d_8x8_internal2 5831 SA8D_INTER 5832 mova [esp+48], m0 5833 5834 mov r0, [r6+20] 5835 mov r2, [r6+28] 5836 add r0, 8*SIZEOF_PIXEL 5837 add r2, 8*SIZEOF_PIXEL 5838 call pixel_sa8d_8x8_internal2 5839 SA8D_INTER 5840 mova [esp+48], m0 5841 call pixel_sa8d_8x8_internal2 5842 SA8D_INTER 5843%if HIGH_BIT_DEPTH == 0 5844 HADDUW m0, m1 5845%endif 5846 movd r4d, m0 5847 add r4d, 1 5848 shr r4d, 1 5849 mov dword [esp+36], r4d 5850 5851 mov 
r0, [r6+20] 5852 mov r2, [r6+28] 5853 add r0, 16*SIZEOF_PIXEL 5854 add r2, 16*SIZEOF_PIXEL 5855 lea r4, [r1 + 2*r1] 5856 call pixel_sa8d_8x8_internal2 5857%if HIGH_BIT_DEPTH 5858 HADDUW m0, m1 5859%endif 5860 mova [esp+48], m0 5861 call pixel_sa8d_8x8_internal2 5862 SA8D_INTER 5863 mova [esp+48], m0 5864 5865 mov r0, [r6+20] 5866 mov r2, [r6+28] 5867 add r0, 24*SIZEOF_PIXEL 5868 add r2, 24*SIZEOF_PIXEL 5869 call pixel_sa8d_8x8_internal2 5870 SA8D_INTER 5871 mova [esp+64-mmsize], m0 5872 call pixel_sa8d_8x8_internal2 5873 AVG_16x16 5874 5875 mov r0, [r6+20] 5876 mov r2, [r6+28] 5877 add r0, 32*SIZEOF_PIXEL 5878 add r2, 32*SIZEOF_PIXEL 5879 lea r4, [r1 + 2*r1] 5880 call pixel_sa8d_8x8_internal2 5881%if HIGH_BIT_DEPTH 5882 HADDUW m0, m1 5883%endif 5884 mova [esp+48], m0 5885 call pixel_sa8d_8x8_internal2 5886 SA8D_INTER 5887 mova [esp+48], m0 5888 5889 mov r0, [r6+20] 5890 mov r2, [r6+28] 5891 add r0, 40*SIZEOF_PIXEL 5892 add r2, 40*SIZEOF_PIXEL 5893 call pixel_sa8d_8x8_internal2 5894 SA8D_INTER 5895 mova [esp+64-mmsize], m0 5896 call pixel_sa8d_8x8_internal2 5897 AVG_16x16 5898 5899 mov r0, [r6+20] 5900 mov r2, [r6+28] 5901 add r0, 48*SIZEOF_PIXEL 5902 add r2, 48*SIZEOF_PIXEL 5903 lea r4, [r1 + 2*r1] 5904 call pixel_sa8d_8x8_internal2 5905%if HIGH_BIT_DEPTH 5906 HADDUW m0, m1 5907%endif 5908 mova [esp+48], m0 5909 call pixel_sa8d_8x8_internal2 5910 SA8D_INTER 5911 mova [esp+48], m0 5912 5913 mov r0, [r6+20] 5914 mov r2, [r6+28] 5915 add r0, 56*SIZEOF_PIXEL 5916 add r2, 56*SIZEOF_PIXEL 5917 call pixel_sa8d_8x8_internal2 5918 SA8D_INTER 5919 mova [esp+64-mmsize], m0 5920 call pixel_sa8d_8x8_internal2 5921 AVG_16x16 5922 5923 mov r0, [r6+20] 5924 mov r2, [r6+28] 5925 lea r0, [r0 + r1*8] 5926 lea r2, [r2 + r3*8] 5927 lea r0, [r0 + r1*8] 5928 lea r2, [r2 + r3*8] 5929 mov [r6+20], r0 5930 mov [r6+28], r2 5931 5932 lea r4, [r1 + 2*r1] 5933 call pixel_sa8d_8x8_internal2 5934%if HIGH_BIT_DEPTH 5935 HADDUW m0, m1 5936%endif 5937 mova [esp+48], m0 5938 call pixel_sa8d_8x8_internal2 5939 SA8D_INTER 5940 mova [esp+48], m0 5941 5942 mov r0, [r6+20] 5943 mov r2, [r6+28] 5944 add r0, 8*SIZEOF_PIXEL 5945 add r2, 8*SIZEOF_PIXEL 5946 call pixel_sa8d_8x8_internal2 5947 SA8D_INTER 5948 mova [esp+64-mmsize], m0 5949 call pixel_sa8d_8x8_internal2 5950 AVG_16x16 5951 5952 mov r0, [r6+20] 5953 mov r2, [r6+28] 5954 add r0, 16*SIZEOF_PIXEL 5955 add r2, 16*SIZEOF_PIXEL 5956 lea r4, [r1 + 2*r1] 5957 call pixel_sa8d_8x8_internal2 5958%if HIGH_BIT_DEPTH 5959 HADDUW m0, m1 5960%endif 5961 mova [esp+48], m0 5962 call pixel_sa8d_8x8_internal2 5963 SA8D_INTER 5964 mova [esp+48], m0 5965 5966 mov r0, [r6+20] 5967 mov r2, [r6+28] 5968 add r0, 24*SIZEOF_PIXEL 5969 add r2, 24*SIZEOF_PIXEL 5970 call pixel_sa8d_8x8_internal2 5971 SA8D_INTER 5972 mova [esp+64-mmsize], m0 5973 call pixel_sa8d_8x8_internal2 5974 AVG_16x16 5975 5976 mov r0, [r6+20] 5977 mov r2, [r6+28] 5978 add r0, 32*SIZEOF_PIXEL 5979 add r2, 32*SIZEOF_PIXEL 5980 lea r4, [r1 + 2*r1] 5981 call pixel_sa8d_8x8_internal2 5982%if HIGH_BIT_DEPTH 5983 HADDUW m0, m1 5984%endif 5985 mova [esp+48], m0 5986 call pixel_sa8d_8x8_internal2 5987 SA8D_INTER 5988 mova [esp+48], m0 5989 5990 mov r0, [r6+20] 5991 mov r2, [r6+28] 5992 add r0, 40*SIZEOF_PIXEL 5993 add r2, 40*SIZEOF_PIXEL 5994 call pixel_sa8d_8x8_internal2 5995 SA8D_INTER 5996 mova [esp+64-mmsize], m0 5997 call pixel_sa8d_8x8_internal2 5998 AVG_16x16 5999 6000 mov r0, [r6+20] 6001 mov r2, [r6+28] 6002 add r0, 48*SIZEOF_PIXEL 6003 add r2, 48*SIZEOF_PIXEL 6004 lea r4, [r1 + 2*r1] 6005 call pixel_sa8d_8x8_internal2 6006%if 
HIGH_BIT_DEPTH 6007 HADDUW m0, m1 6008%endif 6009 mova [esp+48], m0 6010 call pixel_sa8d_8x8_internal2 6011 SA8D_INTER 6012 mova [esp+48], m0 6013 6014 mov r0, [r6+20] 6015 mov r2, [r6+28] 6016 add r0, 56*SIZEOF_PIXEL 6017 add r2, 56*SIZEOF_PIXEL 6018 call pixel_sa8d_8x8_internal2 6019 SA8D_INTER 6020 mova [esp+64-mmsize], m0 6021 call pixel_sa8d_8x8_internal2 6022 AVG_16x16 6023 6024 mov r0, [r6+20] 6025 mov r2, [r6+28] 6026 lea r0, [r0 + r1*8] 6027 lea r2, [r2 + r3*8] 6028 lea r0, [r0 + r1*8] 6029 lea r2, [r2 + r3*8] 6030 mov [r6+20], r0 6031 mov [r6+28], r2 6032 6033 lea r4, [r1 + 2*r1] 6034 call pixel_sa8d_8x8_internal2 6035%if HIGH_BIT_DEPTH 6036 HADDUW m0, m1 6037%endif 6038 mova [esp+48], m0 6039 call pixel_sa8d_8x8_internal2 6040 SA8D_INTER 6041 mova [esp+48], m0 6042 6043 mov r0, [r6+20] 6044 mov r2, [r6+28] 6045 add r0, 8*SIZEOF_PIXEL 6046 add r2, 8*SIZEOF_PIXEL 6047 call pixel_sa8d_8x8_internal2 6048 SA8D_INTER 6049 mova [esp+64-mmsize], m0 6050 call pixel_sa8d_8x8_internal2 6051 AVG_16x16 6052 6053 mov r0, [r6+20] 6054 mov r2, [r6+28] 6055 add r0, 16*SIZEOF_PIXEL 6056 add r2, 16*SIZEOF_PIXEL 6057 lea r4, [r1 + 2*r1] 6058 call pixel_sa8d_8x8_internal2 6059%if HIGH_BIT_DEPTH 6060 HADDUW m0, m1 6061%endif 6062 mova [esp+48], m0 6063 call pixel_sa8d_8x8_internal2 6064 SA8D_INTER 6065 mova [esp+48], m0 6066 6067 mov r0, [r6+20] 6068 mov r2, [r6+28] 6069 add r0, 24*SIZEOF_PIXEL 6070 add r2, 24*SIZEOF_PIXEL 6071 call pixel_sa8d_8x8_internal2 6072 SA8D_INTER 6073 mova [esp+64-mmsize], m0 6074 call pixel_sa8d_8x8_internal2 6075 AVG_16x16 6076 6077 mov r0, [r6+20] 6078 mov r2, [r6+28] 6079 add r0, 32*SIZEOF_PIXEL 6080 add r2, 32*SIZEOF_PIXEL 6081 lea r4, [r1 + 2*r1] 6082 call pixel_sa8d_8x8_internal2 6083%if HIGH_BIT_DEPTH 6084 HADDUW m0, m1 6085%endif 6086 mova [esp+48], m0 6087 call pixel_sa8d_8x8_internal2 6088 SA8D_INTER 6089 mova [esp+48], m0 6090 6091 mov r0, [r6+20] 6092 mov r2, [r6+28] 6093 add r0, 40*SIZEOF_PIXEL 6094 add r2, 40*SIZEOF_PIXEL 6095 call pixel_sa8d_8x8_internal2 6096 SA8D_INTER 6097 mova [esp+64-mmsize], m0 6098 call pixel_sa8d_8x8_internal2 6099 AVG_16x16 6100 6101 mov r0, [r6+20] 6102 mov r2, [r6+28] 6103 add r0, 48*SIZEOF_PIXEL 6104 add r2, 48*SIZEOF_PIXEL 6105 lea r4, [r1 + 2*r1] 6106 call pixel_sa8d_8x8_internal2 6107%if HIGH_BIT_DEPTH 6108 HADDUW m0, m1 6109%endif 6110 mova [esp+48], m0 6111 call pixel_sa8d_8x8_internal2 6112 SA8D_INTER 6113 mova [esp+48], m0 6114 6115 mov r0, [r6+20] 6116 mov r2, [r6+28] 6117 add r0, 56*SIZEOF_PIXEL 6118 add r2, 56*SIZEOF_PIXEL 6119 call pixel_sa8d_8x8_internal2 6120 SA8D_INTER 6121 mova [esp+64-mmsize], m0 6122 call pixel_sa8d_8x8_internal2 6123 SA8D_INTER 6124%if HIGH_BIT_DEPTH == 0 6125 HADDUW m0, m1 6126%endif 6127 movd r4d, m0 6128 add r4d, 1 6129 shr r4d, 1 6130 add r4d, dword [esp+36] 6131 mov eax, r4d 6132 mov esp, r6 6133 RET 6134 6135cglobal pixel_sa8d_64x64, 4,7,8 6136 FIX_STRIDES r1, r3 6137 mov r6, esp 6138 and esp, ~15 6139 sub esp, 64 6140 6141 lea r4, [r1 + 2*r1] 6142 lea r5, [r3 + 2*r3] 6143 call pixel_sa8d_8x8_internal2 6144%if HIGH_BIT_DEPTH 6145 HADDUW m0, m1 6146%endif 6147 mova [rsp+48], m0 6148 call pixel_sa8d_8x8_internal2 6149 SA8D_INTER 6150 mova [esp+48], m0 6151 6152 mov r0, [r6+20] 6153 mov r2, [r6+28] 6154 add r0, 8*SIZEOF_PIXEL 6155 add r2, 8*SIZEOF_PIXEL 6156 call pixel_sa8d_8x8_internal2 6157 SA8D_INTER 6158 mova [esp+48], m0 6159 call pixel_sa8d_8x8_internal2 6160 SA8D_INTER 6161%if HIGH_BIT_DEPTH == 0 6162 HADDUW m0, m1 6163%endif 6164 movd r4d, m0 6165 add r4d, 1 6166 shr r4d, 1 6167 mov 
dword [esp+36], r4d 6168 6169 mov r0, [r6+20] 6170 mov r2, [r6+28] 6171 add r0, 16*SIZEOF_PIXEL 6172 add r2, 16*SIZEOF_PIXEL 6173 lea r4, [r1 + 2*r1] 6174 call pixel_sa8d_8x8_internal2 6175%if HIGH_BIT_DEPTH 6176 HADDUW m0, m1 6177%endif 6178 mova [esp+48], m0 6179 call pixel_sa8d_8x8_internal2 6180 SA8D_INTER 6181 mova [esp+48], m0 6182 6183 mov r0, [r6+20] 6184 mov r2, [r6+28] 6185 add r0, 24*SIZEOF_PIXEL 6186 add r2, 24*SIZEOF_PIXEL 6187 call pixel_sa8d_8x8_internal2 6188 SA8D_INTER 6189 mova [esp+64-mmsize], m0 6190 call pixel_sa8d_8x8_internal2 6191 AVG_16x16 6192 6193 mov r0, [r6+20] 6194 mov r2, [r6+28] 6195 add r0, 32*SIZEOF_PIXEL 6196 add r2, 32*SIZEOF_PIXEL 6197 lea r4, [r1 + 2*r1] 6198 call pixel_sa8d_8x8_internal2 6199%if HIGH_BIT_DEPTH 6200 HADDUW m0, m1 6201%endif 6202 mova [esp+48], m0 6203 call pixel_sa8d_8x8_internal2 6204 SA8D_INTER 6205 mova [esp+48], m0 6206 6207 mov r0, [r6+20] 6208 mov r2, [r6+28] 6209 add r0, 40*SIZEOF_PIXEL 6210 add r2, 40*SIZEOF_PIXEL 6211 call pixel_sa8d_8x8_internal2 6212 SA8D_INTER 6213 mova [esp+64-mmsize], m0 6214 call pixel_sa8d_8x8_internal2 6215 AVG_16x16 6216 6217 mov r0, [r6+20] 6218 mov r2, [r6+28] 6219 add r0, 48*SIZEOF_PIXEL 6220 add r2, 48*SIZEOF_PIXEL 6221 lea r4, [r1 + 2*r1] 6222 call pixel_sa8d_8x8_internal2 6223%if HIGH_BIT_DEPTH 6224 HADDUW m0, m1 6225%endif 6226 mova [esp+48], m0 6227 call pixel_sa8d_8x8_internal2 6228 SA8D_INTER 6229 mova [esp+48], m0 6230 6231 mov r0, [r6+20] 6232 mov r2, [r6+28] 6233 add r0, 56*SIZEOF_PIXEL 6234 add r2, 56*SIZEOF_PIXEL 6235 call pixel_sa8d_8x8_internal2 6236 SA8D_INTER 6237 mova [esp+64-mmsize], m0 6238 call pixel_sa8d_8x8_internal2 6239 AVG_16x16 6240 6241 mov r0, [r6+20] 6242 mov r2, [r6+28] 6243 lea r0, [r0 + r1*8] 6244 lea r2, [r2 + r3*8] 6245 lea r0, [r0 + r1*8] 6246 lea r2, [r2 + r3*8] 6247 mov [r6+20], r0 6248 mov [r6+28], r2 6249 6250 lea r4, [r1 + 2*r1] 6251 call pixel_sa8d_8x8_internal2 6252%if HIGH_BIT_DEPTH 6253 HADDUW m0, m1 6254%endif 6255 mova [esp+48], m0 6256 call pixel_sa8d_8x8_internal2 6257 SA8D_INTER 6258 mova [esp+48], m0 6259 6260 mov r0, [r6+20] 6261 mov r2, [r6+28] 6262 add r0, 8*SIZEOF_PIXEL 6263 add r2, 8*SIZEOF_PIXEL 6264 call pixel_sa8d_8x8_internal2 6265 SA8D_INTER 6266 mova [esp+64-mmsize], m0 6267 call pixel_sa8d_8x8_internal2 6268 AVG_16x16 6269 6270 mov r0, [r6+20] 6271 mov r2, [r6+28] 6272 add r0, 16*SIZEOF_PIXEL 6273 add r2, 16*SIZEOF_PIXEL 6274 lea r4, [r1 + 2*r1] 6275 call pixel_sa8d_8x8_internal2 6276%if HIGH_BIT_DEPTH 6277 HADDUW m0, m1 6278%endif 6279 mova [esp+48], m0 6280 call pixel_sa8d_8x8_internal2 6281 SA8D_INTER 6282 mova [esp+48], m0 6283 6284 mov r0, [r6+20] 6285 mov r2, [r6+28] 6286 add r0, 24*SIZEOF_PIXEL 6287 add r2, 24*SIZEOF_PIXEL 6288 call pixel_sa8d_8x8_internal2 6289 SA8D_INTER 6290 mova [esp+64-mmsize], m0 6291 call pixel_sa8d_8x8_internal2 6292 AVG_16x16 6293 6294 mov r0, [r6+20] 6295 mov r2, [r6+28] 6296 add r0, 32*SIZEOF_PIXEL 6297 add r2, 32*SIZEOF_PIXEL 6298 lea r4, [r1 + 2*r1] 6299 call pixel_sa8d_8x8_internal2 6300%if HIGH_BIT_DEPTH 6301 HADDUW m0, m1 6302%endif 6303 mova [esp+48], m0 6304 call pixel_sa8d_8x8_internal2 6305 SA8D_INTER 6306 mova [esp+48], m0 6307 6308 mov r0, [r6+20] 6309 mov r2, [r6+28] 6310 add r0, 40*SIZEOF_PIXEL 6311 add r2, 40*SIZEOF_PIXEL 6312 call pixel_sa8d_8x8_internal2 6313 SA8D_INTER 6314 mova [esp+64-mmsize], m0 6315 call pixel_sa8d_8x8_internal2 6316 AVG_16x16 6317 6318 mov r0, [r6+20] 6319 mov r2, [r6+28] 6320 add r0, 48*SIZEOF_PIXEL 6321 add r2, 48*SIZEOF_PIXEL 6322 lea r4, [r1 + 2*r1] 6323 call 
pixel_sa8d_8x8_internal2 6324%if HIGH_BIT_DEPTH 6325 HADDUW m0, m1 6326%endif 6327 mova [esp+48], m0 6328 call pixel_sa8d_8x8_internal2 6329 SA8D_INTER 6330 mova [esp+48], m0 6331 6332 mov r0, [r6+20] 6333 mov r2, [r6+28] 6334 add r0, 56*SIZEOF_PIXEL 6335 add r2, 56*SIZEOF_PIXEL 6336 call pixel_sa8d_8x8_internal2 6337 SA8D_INTER 6338 mova [esp+64-mmsize], m0 6339 call pixel_sa8d_8x8_internal2 6340 AVG_16x16 6341 6342 mov r0, [r6+20] 6343 mov r2, [r6+28] 6344 lea r0, [r0 + r1*8] 6345 lea r2, [r2 + r3*8] 6346 lea r0, [r0 + r1*8] 6347 lea r2, [r2 + r3*8] 6348 mov [r6+20], r0 6349 mov [r6+28], r2 6350 6351 lea r4, [r1 + 2*r1] 6352 call pixel_sa8d_8x8_internal2 6353%if HIGH_BIT_DEPTH 6354 HADDUW m0, m1 6355%endif 6356 mova [esp+48], m0 6357 call pixel_sa8d_8x8_internal2 6358 SA8D_INTER 6359 mova [esp+48], m0 6360 6361 mov r0, [r6+20] 6362 mov r2, [r6+28] 6363 add r0, 8*SIZEOF_PIXEL 6364 add r2, 8*SIZEOF_PIXEL 6365 call pixel_sa8d_8x8_internal2 6366 SA8D_INTER 6367 mova [esp+64-mmsize], m0 6368 call pixel_sa8d_8x8_internal2 6369 AVG_16x16 6370 6371 mov r0, [r6+20] 6372 mov r2, [r6+28] 6373 add r0, 16*SIZEOF_PIXEL 6374 add r2, 16*SIZEOF_PIXEL 6375 lea r4, [r1 + 2*r1] 6376 call pixel_sa8d_8x8_internal2 6377%if HIGH_BIT_DEPTH 6378 HADDUW m0, m1 6379%endif 6380 mova [esp+48], m0 6381 call pixel_sa8d_8x8_internal2 6382 SA8D_INTER 6383 mova [esp+48], m0 6384 6385 mov r0, [r6+20] 6386 mov r2, [r6+28] 6387 add r0, 24*SIZEOF_PIXEL 6388 add r2, 24*SIZEOF_PIXEL 6389 call pixel_sa8d_8x8_internal2 6390 SA8D_INTER 6391 mova [esp+64-mmsize], m0 6392 call pixel_sa8d_8x8_internal2 6393 AVG_16x16 6394 6395 mov r0, [r6+20] 6396 mov r2, [r6+28] 6397 add r0, 32*SIZEOF_PIXEL 6398 add r2, 32*SIZEOF_PIXEL 6399 lea r4, [r1 + 2*r1] 6400 call pixel_sa8d_8x8_internal2 6401%if HIGH_BIT_DEPTH 6402 HADDUW m0, m1 6403%endif 6404 mova [esp+48], m0 6405 call pixel_sa8d_8x8_internal2 6406 SA8D_INTER 6407 mova [esp+48], m0 6408 6409 mov r0, [r6+20] 6410 mov r2, [r6+28] 6411 add r0, 40*SIZEOF_PIXEL 6412 add r2, 40*SIZEOF_PIXEL 6413 call pixel_sa8d_8x8_internal2 6414 SA8D_INTER 6415 mova [esp+64-mmsize], m0 6416 call pixel_sa8d_8x8_internal2 6417 AVG_16x16 6418 6419 mov r0, [r6+20] 6420 mov r2, [r6+28] 6421 add r0, 48*SIZEOF_PIXEL 6422 add r2, 48*SIZEOF_PIXEL 6423 lea r4, [r1 + 2*r1] 6424 call pixel_sa8d_8x8_internal2 6425%if HIGH_BIT_DEPTH 6426 HADDUW m0, m1 6427%endif 6428 mova [esp+48], m0 6429 call pixel_sa8d_8x8_internal2 6430 SA8D_INTER 6431 mova [esp+48], m0 6432 6433 mov r0, [r6+20] 6434 mov r2, [r6+28] 6435 add r0, 56*SIZEOF_PIXEL 6436 add r2, 56*SIZEOF_PIXEL 6437 call pixel_sa8d_8x8_internal2 6438 SA8D_INTER 6439 mova [esp+64-mmsize], m0 6440 call pixel_sa8d_8x8_internal2 6441 AVG_16x16 6442 6443 mov r0, [r6+20] 6444 mov r2, [r6+28] 6445 lea r0, [r0 + r1*8] 6446 lea r2, [r2 + r3*8] 6447 lea r0, [r0 + r1*8] 6448 lea r2, [r2 + r3*8] 6449 mov [r6+20], r0 6450 mov [r6+28], r2 6451 6452 lea r4, [r1 + 2*r1] 6453 call pixel_sa8d_8x8_internal2 6454%if HIGH_BIT_DEPTH 6455 HADDUW m0, m1 6456%endif 6457 mova [esp+48], m0 6458 call pixel_sa8d_8x8_internal2 6459 SA8D_INTER 6460 mova [esp+48], m0 6461 6462 mov r0, [r6+20] 6463 mov r2, [r6+28] 6464 add r0, 8*SIZEOF_PIXEL 6465 add r2, 8*SIZEOF_PIXEL 6466 call pixel_sa8d_8x8_internal2 6467 SA8D_INTER 6468 mova [esp+64-mmsize], m0 6469 call pixel_sa8d_8x8_internal2 6470 AVG_16x16 6471 6472 mov r0, [r6+20] 6473 mov r2, [r6+28] 6474 add r0, 16*SIZEOF_PIXEL 6475 add r2, 16*SIZEOF_PIXEL 6476 lea r4, [r1 + 2*r1] 6477 call pixel_sa8d_8x8_internal2 6478%if HIGH_BIT_DEPTH 6479 HADDUW m0, m1 
6480%endif 6481 mova [esp+48], m0 6482 call pixel_sa8d_8x8_internal2 6483 SA8D_INTER 6484 mova [esp+48], m0 6485 6486 mov r0, [r6+20] 6487 mov r2, [r6+28] 6488 add r0, 24*SIZEOF_PIXEL 6489 add r2, 24*SIZEOF_PIXEL 6490 call pixel_sa8d_8x8_internal2 6491 SA8D_INTER 6492 mova [esp+64-mmsize], m0 6493 call pixel_sa8d_8x8_internal2 6494 AVG_16x16 6495 6496 mov r0, [r6+20] 6497 mov r2, [r6+28] 6498 add r0, 32*SIZEOF_PIXEL 6499 add r2, 32*SIZEOF_PIXEL 6500 lea r4, [r1 + 2*r1] 6501 call pixel_sa8d_8x8_internal2 6502%if HIGH_BIT_DEPTH 6503 HADDUW m0, m1 6504%endif 6505 mova [esp+48], m0 6506 call pixel_sa8d_8x8_internal2 6507 SA8D_INTER 6508 mova [esp+48], m0 6509 6510 mov r0, [r6+20] 6511 mov r2, [r6+28] 6512 add r0, 40*SIZEOF_PIXEL 6513 add r2, 40*SIZEOF_PIXEL 6514 call pixel_sa8d_8x8_internal2 6515 SA8D_INTER 6516 mova [esp+64-mmsize], m0 6517 call pixel_sa8d_8x8_internal2 6518 AVG_16x16 6519 6520 mov r0, [r6+20] 6521 mov r2, [r6+28] 6522 add r0, 48*SIZEOF_PIXEL 6523 add r2, 48*SIZEOF_PIXEL 6524 lea r4, [r1 + 2*r1] 6525 call pixel_sa8d_8x8_internal2 6526%if HIGH_BIT_DEPTH 6527 HADDUW m0, m1 6528%endif 6529 mova [esp+48], m0 6530 call pixel_sa8d_8x8_internal2 6531 SA8D_INTER 6532 mova [esp+48], m0 6533 6534 mov r0, [r6+20] 6535 mov r2, [r6+28] 6536 add r0, 56*SIZEOF_PIXEL 6537 add r2, 56*SIZEOF_PIXEL 6538 call pixel_sa8d_8x8_internal2 6539 SA8D_INTER 6540 mova [esp+64-mmsize], m0 6541 call pixel_sa8d_8x8_internal2 6542 SA8D_INTER 6543%if HIGH_BIT_DEPTH == 0 6544 HADDUW m0, m1 6545%endif 6546 movd r4d, m0 6547 add r4d, 1 6548 shr r4d, 1 6549 add r4d, dword [esp+36] 6550 mov eax, r4d 6551 mov esp, r6 6552 RET 6553%endif ; !ARCH_X86_64 6554%endmacro ; SA8D 6555 6556 6557%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 6558INIT_YMM avx2 6559cglobal sa8d_8x8_12bit 6560 pmovzxwd m0, [r0] 6561 pmovzxwd m9, [r2] 6562 psubd m0, m9 6563 6564 pmovzxwd m1, [r0 + r1] 6565 pmovzxwd m9, [r2 + r3] 6566 psubd m1, m9 6567 6568 pmovzxwd m2, [r0 + r1 * 2] 6569 pmovzxwd m9, [r2 + r3 * 2] 6570 psubd m2, m9 6571 6572 pmovzxwd m8, [r0 + r4] 6573 pmovzxwd m9, [r2 + r5] 6574 psubd m8, m9 6575 6576 lea r0, [r0 + r1 * 4] 6577 lea r2, [r2 + r3 * 4] 6578 6579 pmovzxwd m4, [r0] 6580 pmovzxwd m9, [r2] 6581 psubd m4, m9 6582 6583 pmovzxwd m5, [r0 + r1] 6584 pmovzxwd m9, [r2 + r3] 6585 psubd m5, m9 6586 6587 pmovzxwd m3, [r0 + r1 * 2] 6588 pmovzxwd m9, [r2 + r3 * 2] 6589 psubd m3, m9 6590 6591 pmovzxwd m7, [r0 + r4] 6592 pmovzxwd m9, [r2 + r5] 6593 psubd m7, m9 6594 6595 mova m6, m0 6596 paddd m0, m1 6597 psubd m1, m6 6598 mova m6, m2 6599 paddd m2, m8 6600 psubd m8, m6 6601 mova m6, m0 6602 6603 punpckldq m0, m1 6604 punpckhdq m6, m1 6605 6606 mova m1, m0 6607 paddd m0, m6 6608 psubd m6, m1 6609 mova m1, m2 6610 6611 punpckldq m2, m8 6612 punpckhdq m1, m8 6613 6614 mova m8, m2 6615 paddd m2, m1 6616 psubd m1, m8 6617 mova m8, m4 6618 paddd m4, m5 6619 psubd m5, m8 6620 mova m8, m3 6621 paddd m3, m7 6622 psubd m7, m8 6623 mova m8, m4 6624 6625 punpckldq m4, m5 6626 punpckhdq m8, m5 6627 6628 mova m5, m4 6629 paddd m4, m8 6630 psubd m8, m5 6631 mova m5, m3 6632 punpckldq m3, m7 6633 punpckhdq m5, m7 6634 6635 mova m7, m3 6636 paddd m3, m5 6637 psubd m5, m7 6638 mova m7, m0 6639 paddd m0, m2 6640 psubd m2, m7 6641 mova m7, m6 6642 paddd m6, m1 6643 psubd m1, m7 6644 mova m7, m0 6645 6646 punpcklqdq m0, m2 6647 punpckhqdq m7, m2 6648 6649 mova m2, m0 6650 paddd m0, m7 6651 psubd m7, m2 6652 mova m2, m6 6653 6654 punpcklqdq m6, m1 6655 punpckhqdq m2, m1 6656 6657 mova m1, m6 6658 paddd m6, m2 6659 psubd m2, m1 6660 mova m1, m4 6661 paddd m4, 
m3 6662 psubd m3, m1 6663 mova m1, m8 6664 paddd m8, m5 6665 psubd m5, m1 6666 mova m1, m4 6667 6668 punpcklqdq m4, m3 6669 punpckhqdq m1, m3 6670 6671 mova m3, m4 6672 paddd m4, m1 6673 psubd m1, m3 6674 mova m3, m8 6675 6676 punpcklqdq m8, m5 6677 punpckhqdq m3, m5 6678 6679 mova m5, m8 6680 paddd m8, m3 6681 psubd m3, m5 6682 mova m5, m0 6683 paddd m0, m4 6684 psubd m4, m5 6685 mova m5, m7 6686 paddd m7, m1 6687 psubd m1, m5 6688 mova m5, m0 6689 6690 vinserti128 m0, m0, xm4, 1 6691 vperm2i128 m5, m5, m4, 00110001b 6692 6693 pxor m4, m4 6694 psubd m4, m0 6695 pmaxsd m0, m4 6696 pxor m4, m4 6697 psubd m4, m5 6698 pmaxsd m5, m4 6699 pmaxsd m0, m5 6700 mova m4, m7 6701 6702 vinserti128 m7, m7, xm1, 1 6703 vperm2i128 m4, m4, m1, 00110001b 6704 6705 pxor m1, m1 6706 psubd m1, m7 6707 pmaxsd m7, m1 6708 pxor m1, m1 6709 psubd m1, m4 6710 pmaxsd m4, m1 6711 pmaxsd m7, m4 6712 mova m1, m6 6713 paddd m6, m8 6714 psubd m8, m1 6715 mova m1, m2 6716 paddd m2, m3 6717 psubd m3, m1 6718 mova m1, m6 6719 6720 vinserti128 m6, m6, xm8, 1 6721 vperm2i128 m1, m1, m8, 00110001b 6722 6723 pxor m8, m8 6724 psubd m8, m6 6725 pmaxsd m6, m8 6726 pxor m8, m8 6727 psubd m8, m1 6728 pmaxsd m1, m8 6729 pmaxsd m6, m1 6730 mova m8, m2 6731 6732 vinserti128 m2, m2, xm3, 1 6733 vperm2i128 m8, m8, m3, 00110001b 6734 6735 pxor m3, m3 6736 psubd m3, m2 6737 pmaxsd m2, m3 6738 pxor m3, m3 6739 psubd m3, m8 6740 pmaxsd m8, m3 6741 pmaxsd m2, m8 6742 paddd m0, m6 6743 paddd m0, m7 6744 paddd m0, m2 6745 ret 6746 6747cglobal pixel_sa8d_8x8, 4,6,10 6748 add r1d, r1d 6749 add r3d, r3d 6750 lea r4, [r1 + r1 * 2] 6751 lea r5, [r3 + r3 * 2] 6752 6753 call sa8d_8x8_12bit 6754 6755 vextracti128 xm6, m0, 1 6756 paddd xm0, xm6 6757 6758 movhlps xm6, xm0 6759 paddd xm0, xm6 6760 6761 pshuflw xm6, xm0, 0Eh 6762 paddd xm0, xm6 6763 movd eax, xm0 6764 add eax, 1 6765 shr eax, 1 6766 RET 6767 6768cglobal pixel_sa8d_8x16, 4,7,11 6769 add r1d, r1d 6770 add r3d, r3d 6771 lea r4, [r1 + r1 * 2] 6772 lea r5, [r3 + r3 * 2] 6773 pxor m10, m10 6774 6775 call sa8d_8x8_12bit 6776 6777 vextracti128 xm6, m0, 1 6778 paddd xm0, xm6 6779 6780 movhlps xm6, xm0 6781 paddd xm0, xm6 6782 6783 pshuflw xm6, xm0, 0Eh 6784 paddd xm0, xm6 6785 paddd xm0, [pd_1] 6786 psrld xm0, 1 6787 paddd xm10, xm0 6788 6789 lea r0, [r0 + r1 * 4] 6790 lea r2, [r2 + r3 * 4] 6791 call sa8d_8x8_12bit 6792 6793 vextracti128 xm6, m0, 1 6794 paddd xm0, xm6 6795 6796 movhlps xm6, xm0 6797 paddd xm0, xm6 6798 6799 pshuflw xm6, xm0, 0Eh 6800 paddd xm0, xm6 6801 paddd xm0, [pd_1] 6802 psrld xm0, 1 6803 paddd xm0, xm10 6804 movd eax, xm0 6805 RET 6806 6807cglobal pixel_sa8d_16x16, 4,8,11 6808 add r1d, r1d 6809 add r3d, r3d 6810 lea r4, [r1 + r1 * 2] 6811 lea r5, [r3 + r3 * 2] 6812 mov r6, r0 6813 mov r7, r2 6814 pxor m10, m10 6815 6816 call sa8d_8x8_12bit 6817 paddd m10, m0 6818 6819 lea r0, [r0 + r1 * 4] 6820 lea r2, [r2 + r3 * 4] 6821 call sa8d_8x8_12bit 6822 paddd m10, m0 6823 6824 lea r0, [r6 + 16] 6825 lea r2, [r7 + 16] 6826 call sa8d_8x8_12bit 6827 paddd m10, m0 6828 6829 lea r0, [r0 + r1 * 4] 6830 lea r2, [r2 + r3 * 4] 6831 call sa8d_8x8_12bit 6832 paddd m0, m10 6833 6834 vextracti128 xm6, m0, 1 6835 paddd xm0, xm6 6836 6837 movhlps xm6, xm0 6838 paddd xm0, xm6 6839 6840 pshuflw xm6, xm0, 0Eh 6841 paddd xm0, xm6 6842 movd eax, xm0 6843 add eax, 1 6844 shr eax, 1 6845 RET 6846 6847cglobal pixel_sa8d_16x32, 4,8,12 6848 add r1d, r1d 6849 add r3d, r3d 6850 lea r4, [r1 + r1 * 2] 6851 lea r5, [r3 + r3 * 2] 6852 mov r6, r0 6853 mov r7, r2 6854 pxor m10, m10 6855 pxor m11, m11 6856 6857 call 
sa8d_8x8_12bit 6858 paddd m10, m0 6859 6860 lea r0, [r0 + r1 * 4] 6861 lea r2, [r2 + r3 * 4] 6862 call sa8d_8x8_12bit 6863 paddd m10, m0 6864 6865 lea r0, [r6 + 16] 6866 lea r2, [r7 + 16] 6867 call sa8d_8x8_12bit 6868 paddd m10, m0 6869 6870 lea r0, [r0 + r1 * 4] 6871 lea r2, [r2 + r3 * 4] 6872 call sa8d_8x8_12bit 6873 paddd m0, m10 6874 6875 vextracti128 xm6, m0, 1 6876 paddd xm0, xm6 6877 6878 movhlps xm6, xm0 6879 paddd xm0, xm6 6880 6881 pshuflw xm6, xm0, 0Eh 6882 paddd xm0, xm6 6883 paddd xm0, [pd_1] 6884 psrld xm0, 1 6885 paddd xm11, xm0 6886 6887 lea r6, [r6 + r1 * 8] 6888 lea r6, [r6 + r1 * 8] 6889 lea r7, [r7 + r3 * 8] 6890 lea r7, [r7 + r3 * 8] 6891 pxor m10, m10 6892 mov r0, r6 6893 mov r2, r7 6894 call sa8d_8x8_12bit 6895 paddd m10, m0 6896 6897 lea r0, [r0 + r1 * 4] 6898 lea r2, [r2 + r3 * 4] 6899 call sa8d_8x8_12bit 6900 paddd m10, m0 6901 6902 lea r0, [r6 + 16] 6903 lea r2, [r7 + 16] 6904 call sa8d_8x8_12bit 6905 paddd m10, m0 6906 6907 lea r0, [r0 + r1 * 4] 6908 lea r2, [r2 + r3 * 4] 6909 call sa8d_8x8_12bit 6910 paddd m0, m10 6911 6912 vextracti128 xm6, m0, 1 6913 paddd xm0, xm6 6914 6915 movhlps xm6, xm0 6916 paddd xm0, xm6 6917 6918 pshuflw xm6, xm0, 0Eh 6919 paddd xm0, xm6 6920 paddd xm0, [pd_1] 6921 psrld xm0, 1 6922 paddd xm11, xm0 6923 movd eax, xm11 6924 RET 6925 6926cglobal pixel_sa8d_32x32, 4,8,12 6927 add r1d, r1d 6928 add r3d, r3d 6929 lea r4, [r1 + r1 * 2] 6930 lea r5, [r3 + r3 * 2] 6931 mov r6, r0 6932 mov r7, r2 6933 pxor m10, m10 6934 pxor m11, m11 6935 6936 call sa8d_8x8_12bit 6937 paddd m10, m0 6938 6939 lea r0, [r0 + r1 * 4] 6940 lea r2, [r2 + r3 * 4] 6941 call sa8d_8x8_12bit 6942 paddd m10, m0 6943 6944 lea r0, [r6 + 16] 6945 lea r2, [r7 + 16] 6946 call sa8d_8x8_12bit 6947 paddd m10, m0 6948 6949 lea r0, [r0 + r1 * 4] 6950 lea r2, [r2 + r3 * 4] 6951 call sa8d_8x8_12bit 6952 paddd m0, m10 6953 6954 vextracti128 xm6, m0, 1 6955 paddd xm0, xm6 6956 6957 movhlps xm6, xm0 6958 paddd xm0, xm6 6959 6960 pshuflw xm6, xm0, 0Eh 6961 paddd xm0, xm6 6962 paddd xm0, [pd_1] 6963 psrld xm0, 1 6964 paddd xm11, xm0 6965 6966 pxor m10, m10 6967 lea r0, [r6 + 32] 6968 lea r2, [r7 + 32] 6969 call sa8d_8x8_12bit 6970 paddd m10, m0 6971 6972 lea r0, [r0 + r1 * 4] 6973 lea r2, [r2 + r3 * 4] 6974 call sa8d_8x8_12bit 6975 paddd m10, m0 6976 6977 lea r0, [r6 + 48] 6978 lea r2, [r7 + 48] 6979 call sa8d_8x8_12bit 6980 paddd m10, m0 6981 6982 lea r0, [r0 + r1 * 4] 6983 lea r2, [r2 + r3 * 4] 6984 call sa8d_8x8_12bit 6985 paddd m0, m10 6986 6987 vextracti128 xm6, m0, 1 6988 paddd xm0, xm6 6989 6990 movhlps xm6, xm0 6991 paddd xm0, xm6 6992 6993 pshuflw xm6, xm0, 0Eh 6994 paddd xm0, xm6 6995 paddd xm0, [pd_1] 6996 psrld xm0, 1 6997 paddd xm11, xm0 6998 6999 lea r6, [r6 + r1 * 8] 7000 lea r6, [r6 + r1 * 8] 7001 lea r7, [r7 + r3 * 8] 7002 lea r7, [r7 + r3 * 8] 7003 pxor m10, m10 7004 mov r0, r6 7005 mov r2, r7 7006 call sa8d_8x8_12bit 7007 paddd m10, m0 7008 7009 lea r0, [r0 + r1 * 4] 7010 lea r2, [r2 + r3 * 4] 7011 call sa8d_8x8_12bit 7012 paddd m10, m0 7013 7014 lea r0, [r6 + 16] 7015 lea r2, [r7 + 16] 7016 call sa8d_8x8_12bit 7017 paddd m10, m0 7018 7019 lea r0, [r0 + r1 * 4] 7020 lea r2, [r2 + r3 * 4] 7021 call sa8d_8x8_12bit 7022 paddd m0, m10 7023 7024 vextracti128 xm6, m0, 1 7025 paddd xm0, xm6 7026 7027 movhlps xm6, xm0 7028 paddd xm0, xm6 7029 7030 pshuflw xm6, xm0, 0Eh 7031 paddd xm0, xm6 7032 paddd xm0, [pd_1] 7033 psrld xm0, 1 7034 paddd xm11, xm0 7035 7036 pxor m10, m10 7037 lea r0, [r6 + 32] 7038 lea r2, [r7 + 32] 7039 call sa8d_8x8_12bit 7040 paddd m10, m0 7041 7042 lea 
r0, [r0 + r1 * 4] 7043 lea r2, [r2 + r3 * 4] 7044 call sa8d_8x8_12bit 7045 paddd m10, m0 7046 7047 lea r0, [r6 + 48] 7048 lea r2, [r7 + 48] 7049 call sa8d_8x8_12bit 7050 paddd m10, m0 7051 7052 lea r0, [r0 + r1 * 4] 7053 lea r2, [r2 + r3 * 4] 7054 call sa8d_8x8_12bit 7055 paddd m0, m10 7056 7057 vextracti128 xm6, m0, 1 7058 paddd xm0, xm6 7059 7060 movhlps xm6, xm0 7061 paddd xm0, xm6 7062 7063 pshuflw xm6, xm0, 0Eh 7064 paddd xm0, xm6 7065 paddd xm0, [pd_1] 7066 psrld xm0, 1 7067 paddd xm11, xm0 7068 movd eax, xm11 7069 RET 7070 7071cglobal pixel_sa8d_32x64, 4,8,12 7072 add r1d, r1d 7073 add r3d, r3d 7074 lea r4, [r1 + r1 * 2] 7075 lea r5, [r3 + r3 * 2] 7076 mov r6, r0 7077 mov r7, r2 7078 pxor m10, m10 7079 pxor m11, m11 7080 7081 call sa8d_8x8_12bit 7082 paddd m10, m0 7083 7084 lea r0, [r0 + r1 * 4] 7085 lea r2, [r2 + r3 * 4] 7086 call sa8d_8x8_12bit 7087 paddd m10, m0 7088 7089 lea r0, [r6 + 16] 7090 lea r2, [r7 + 16] 7091 call sa8d_8x8_12bit 7092 paddd m10, m0 7093 7094 lea r0, [r0 + r1 * 4] 7095 lea r2, [r2 + r3 * 4] 7096 call sa8d_8x8_12bit 7097 paddd m0, m10 7098 7099 vextracti128 xm6, m0, 1 7100 paddd xm0, xm6 7101 7102 movhlps xm6, xm0 7103 paddd xm0, xm6 7104 7105 pshuflw xm6, xm0, 0Eh 7106 paddd xm0, xm6 7107 paddd xm0, [pd_1] 7108 psrld xm0, 1 7109 paddd xm11, xm0 7110 7111 pxor m10, m10 7112 lea r0, [r6 + 32] 7113 lea r2, [r7 + 32] 7114 call sa8d_8x8_12bit 7115 paddd m10, m0 7116 7117 lea r0, [r0 + r1 * 4] 7118 lea r2, [r2 + r3 * 4] 7119 call sa8d_8x8_12bit 7120 paddd m10, m0 7121 7122 lea r0, [r6 + 48] 7123 lea r2, [r7 + 48] 7124 call sa8d_8x8_12bit 7125 paddd m10, m0 7126 7127 lea r0, [r0 + r1 * 4] 7128 lea r2, [r2 + r3 * 4] 7129 call sa8d_8x8_12bit 7130 paddd m0, m10 7131 7132 vextracti128 xm6, m0, 1 7133 paddd xm0, xm6 7134 7135 movhlps xm6, xm0 7136 paddd xm0, xm6 7137 7138 pshuflw xm6, xm0, 0Eh 7139 paddd xm0, xm6 7140 paddd xm0, [pd_1] 7141 psrld xm0, 1 7142 paddd xm11, xm0 7143 7144 lea r6, [r6 + r1 * 8] 7145 lea r6, [r6 + r1 * 8] 7146 lea r7, [r7 + r3 * 8] 7147 lea r7, [r7 + r3 * 8] 7148 pxor m10, m10 7149 mov r0, r6 7150 mov r2, r7 7151 call sa8d_8x8_12bit 7152 paddd m10, m0 7153 7154 lea r0, [r0 + r1 * 4] 7155 lea r2, [r2 + r3 * 4] 7156 call sa8d_8x8_12bit 7157 paddd m10, m0 7158 7159 lea r0, [r6 + 16] 7160 lea r2, [r7 + 16] 7161 call sa8d_8x8_12bit 7162 paddd m10, m0 7163 7164 lea r0, [r0 + r1 * 4] 7165 lea r2, [r2 + r3 * 4] 7166 call sa8d_8x8_12bit 7167 paddd m0, m10 7168 7169 vextracti128 xm6, m0, 1 7170 paddd xm0, xm6 7171 7172 movhlps xm6, xm0 7173 paddd xm0, xm6 7174 7175 pshuflw xm6, xm0, 0Eh 7176 paddd xm0, xm6 7177 paddd xm0, [pd_1] 7178 psrld xm0, 1 7179 paddd xm11, xm0 7180 7181 pxor m10, m10 7182 lea r0, [r6 + 32] 7183 lea r2, [r7 + 32] 7184 call sa8d_8x8_12bit 7185 paddd m10, m0 7186 7187 lea r0, [r0 + r1 * 4] 7188 lea r2, [r2 + r3 * 4] 7189 call sa8d_8x8_12bit 7190 paddd m10, m0 7191 7192 lea r0, [r6 + 48] 7193 lea r2, [r7 + 48] 7194 call sa8d_8x8_12bit 7195 paddd m10, m0 7196 7197 lea r0, [r0 + r1 * 4] 7198 lea r2, [r2 + r3 * 4] 7199 call sa8d_8x8_12bit 7200 paddd m0, m10 7201 7202 vextracti128 xm6, m0, 1 7203 paddd xm0, xm6 7204 7205 movhlps xm6, xm0 7206 paddd xm0, xm6 7207 7208 pshuflw xm6, xm0, 0Eh 7209 paddd xm0, xm6 7210 paddd xm0, [pd_1] 7211 psrld xm0, 1 7212 paddd xm11, xm0 7213 7214 lea r6, [r6 + r1 * 8] 7215 lea r6, [r6 + r1 * 8] 7216 lea r7, [r7 + r3 * 8] 7217 lea r7, [r7 + r3 * 8] 7218 pxor m10, m10 7219 mov r0, r6 7220 mov r2, r7 7221 call sa8d_8x8_12bit 7222 paddd m10, m0 7223 7224 lea r0, [r0 + r1 * 4] 7225 lea r2, [r2 + r3 * 4] 7226 
call sa8d_8x8_12bit 7227 paddd m10, m0 7228 7229 lea r0, [r6 + 16] 7230 lea r2, [r7 + 16] 7231 call sa8d_8x8_12bit 7232 paddd m10, m0 7233 7234 lea r0, [r0 + r1 * 4] 7235 lea r2, [r2 + r3 * 4] 7236 call sa8d_8x8_12bit 7237 paddd m0, m10 7238 7239 vextracti128 xm6, m0, 1 7240 paddd xm0, xm6 7241 7242 movhlps xm6, xm0 7243 paddd xm0, xm6 7244 7245 pshuflw xm6, xm0, 0Eh 7246 paddd xm0, xm6 7247 paddd xm0, [pd_1] 7248 psrld xm0, 1 7249 paddd xm11, xm0 7250 7251 pxor m10, m10 7252 lea r0, [r6 + 32] 7253 lea r2, [r7 + 32] 7254 call sa8d_8x8_12bit 7255 paddd m10, m0 7256 7257 lea r0, [r0 + r1 * 4] 7258 lea r2, [r2 + r3 * 4] 7259 call sa8d_8x8_12bit 7260 paddd m10, m0 7261 7262 lea r0, [r6 + 48] 7263 lea r2, [r7 + 48] 7264 call sa8d_8x8_12bit 7265 paddd m10, m0 7266 7267 lea r0, [r0 + r1 * 4] 7268 lea r2, [r2 + r3 * 4] 7269 call sa8d_8x8_12bit 7270 paddd m0, m10 7271 7272 vextracti128 xm6, m0, 1 7273 paddd xm0, xm6 7274 7275 movhlps xm6, xm0 7276 paddd xm0, xm6 7277 7278 pshuflw xm6, xm0, 0Eh 7279 paddd xm0, xm6 7280 paddd xm0, [pd_1] 7281 psrld xm0, 1 7282 paddd xm11, xm0 7283 7284 lea r6, [r6 + r1 * 8] 7285 lea r6, [r6 + r1 * 8] 7286 lea r7, [r7 + r3 * 8] 7287 lea r7, [r7 + r3 * 8] 7288 pxor m10, m10 7289 mov r0, r6 7290 mov r2, r7 7291 call sa8d_8x8_12bit 7292 paddd m10, m0 7293 7294 lea r0, [r0 + r1 * 4] 7295 lea r2, [r2 + r3 * 4] 7296 call sa8d_8x8_12bit 7297 paddd m10, m0 7298 7299 lea r0, [r6 + 16] 7300 lea r2, [r7 + 16] 7301 call sa8d_8x8_12bit 7302 paddd m10, m0 7303 7304 lea r0, [r0 + r1 * 4] 7305 lea r2, [r2 + r3 * 4] 7306 call sa8d_8x8_12bit 7307 paddd m0, m10 7308 7309 vextracti128 xm6, m0, 1 7310 paddd xm0, xm6 7311 7312 movhlps xm6, xm0 7313 paddd xm0, xm6 7314 7315 pshuflw xm6, xm0, 0Eh 7316 paddd xm0, xm6 7317 paddd xm0, [pd_1] 7318 psrld xm0, 1 7319 paddd xm11, xm0 7320 7321 pxor m10, m10 7322 lea r0, [r6 + 32] 7323 lea r2, [r7 + 32] 7324 call sa8d_8x8_12bit 7325 paddd m10, m0 7326 7327 lea r0, [r0 + r1 * 4] 7328 lea r2, [r2 + r3 * 4] 7329 call sa8d_8x8_12bit 7330 paddd m10, m0 7331 7332 lea r0, [r6 + 48] 7333 lea r2, [r7 + 48] 7334 call sa8d_8x8_12bit 7335 paddd m10, m0 7336 7337 lea r0, [r0 + r1 * 4] 7338 lea r2, [r2 + r3 * 4] 7339 call sa8d_8x8_12bit 7340 paddd m0, m10 7341 7342 vextracti128 xm6, m0, 1 7343 paddd xm0, xm6 7344 7345 movhlps xm6, xm0 7346 paddd xm0, xm6 7347 7348 pshuflw xm6, xm0, 0Eh 7349 paddd xm0, xm6 7350 paddd xm0, [pd_1] 7351 psrld xm0, 1 7352 paddd xm11, xm0 7353 movd eax, xm11 7354 RET 7355 7356cglobal pixel_sa8d_64x64, 4,8,12 7357 add r1d, r1d 7358 add r3d, r3d 7359 lea r4, [r1 + r1 * 2] 7360 lea r5, [r3 + r3 * 2] 7361 mov r6, r0 7362 mov r7, r2 7363 pxor m10, m10 7364 pxor m11, m11 7365 7366 call sa8d_8x8_12bit 7367 paddd m10, m0 7368 7369 lea r0, [r0 + r1 * 4] 7370 lea r2, [r2 + r3 * 4] 7371 call sa8d_8x8_12bit 7372 paddd m10, m0 7373 7374 lea r0, [r6 + 16] 7375 lea r2, [r7 + 16] 7376 call sa8d_8x8_12bit 7377 paddd m10, m0 7378 7379 lea r0, [r0 + r1 * 4] 7380 lea r2, [r2 + r3 * 4] 7381 call sa8d_8x8_12bit 7382 paddd m0, m10 7383 7384 vextracti128 xm6, m0, 1 7385 paddd xm0, xm6 7386 7387 movhlps xm6, xm0 7388 paddd xm0, xm6 7389 7390 pshuflw xm6, xm0, 0Eh 7391 paddd xm0, xm6 7392 paddd xm0, [pd_1] 7393 psrld xm0, 1 7394 paddd xm11, xm0 7395 7396 pxor m10, m10 7397 lea r0, [r6 + 32] 7398 lea r2, [r7 + 32] 7399 call sa8d_8x8_12bit 7400 paddd m10, m0 7401 7402 lea r0, [r0 + r1 * 4] 7403 lea r2, [r2 + r3 * 4] 7404 call sa8d_8x8_12bit 7405 paddd m10, m0 7406 7407 lea r0, [r6 + 48] 7408 lea r2, [r7 + 48] 7409 call sa8d_8x8_12bit 7410 paddd m10, m0 7411 7412 
lea r0, [r0 + r1 * 4] 7413 lea r2, [r2 + r3 * 4] 7414 call sa8d_8x8_12bit 7415 paddd m0, m10 7416 7417 vextracti128 xm6, m0, 1 7418 paddd xm0, xm6 7419 7420 movhlps xm6, xm0 7421 paddd xm0, xm6 7422 7423 pshuflw xm6, xm0, 0Eh 7424 paddd xm0, xm6 7425 paddd xm0, [pd_1] 7426 psrld xm0, 1 7427 paddd xm11, xm0 7428 7429 pxor m10, m10 7430 lea r0, [r6 + 64] 7431 lea r2, [r7 + 64] 7432 call sa8d_8x8_12bit 7433 paddd m10, m0 7434 7435 lea r0, [r0 + r1 * 4] 7436 lea r2, [r2 + r3 * 4] 7437 call sa8d_8x8_12bit 7438 paddd m10, m0 7439 7440 lea r0, [r6 + 80] 7441 lea r2, [r7 + 80] 7442 call sa8d_8x8_12bit 7443 paddd m10, m0 7444 7445 lea r0, [r0 + r1 * 4] 7446 lea r2, [r2 + r3 * 4] 7447 call sa8d_8x8_12bit 7448 paddd m0, m10 7449 7450 vextracti128 xm6, m0, 1 7451 paddd xm0, xm6 7452 7453 movhlps xm6, xm0 7454 paddd xm0, xm6 7455 7456 pshuflw xm6, xm0, 0Eh 7457 paddd xm0, xm6 7458 paddd xm0, [pd_1] 7459 psrld xm0, 1 7460 paddd xm11, xm0 7461 7462 pxor m10, m10 7463 lea r0, [r6 + 96] 7464 lea r2, [r7 + 96] 7465 call sa8d_8x8_12bit 7466 paddd m10, m0 7467 7468 lea r0, [r0 + r1 * 4] 7469 lea r2, [r2 + r3 * 4] 7470 call sa8d_8x8_12bit 7471 paddd m10, m0 7472 7473 lea r0, [r6 + 112] 7474 lea r2, [r7 + 112] 7475 call sa8d_8x8_12bit 7476 paddd m10, m0 7477 7478 lea r0, [r0 + r1 * 4] 7479 lea r2, [r2 + r3 * 4] 7480 call sa8d_8x8_12bit 7481 paddd m0, m10 7482 7483 vextracti128 xm6, m0, 1 7484 paddd xm0, xm6 7485 7486 movhlps xm6, xm0 7487 paddd xm0, xm6 7488 7489 pshuflw xm6, xm0, 0Eh 7490 paddd xm0, xm6 7491 paddd xm0, [pd_1] 7492 psrld xm0, 1 7493 paddd xm11, xm0 7494 7495 lea r6, [r6 + r1 * 8] 7496 lea r6, [r6 + r1 * 8] 7497 lea r7, [r7 + r3 * 8] 7498 lea r7, [r7 + r3 * 8] 7499 pxor m10, m10 7500 mov r0, r6 7501 mov r2, r7 7502 call sa8d_8x8_12bit 7503 paddd m10, m0 7504 7505 lea r0, [r0 + r1 * 4] 7506 lea r2, [r2 + r3 * 4] 7507 call sa8d_8x8_12bit 7508 paddd m10, m0 7509 7510 lea r0, [r6 + 16] 7511 lea r2, [r7 + 16] 7512 call sa8d_8x8_12bit 7513 paddd m10, m0 7514 7515 lea r0, [r0 + r1 * 4] 7516 lea r2, [r2 + r3 * 4] 7517 call sa8d_8x8_12bit 7518 paddd m0, m10 7519 7520 vextracti128 xm6, m0, 1 7521 paddd xm0, xm6 7522 7523 movhlps xm6, xm0 7524 paddd xm0, xm6 7525 7526 pshuflw xm6, xm0, 0Eh 7527 paddd xm0, xm6 7528 paddd xm0, [pd_1] 7529 psrld xm0, 1 7530 paddd xm11, xm0 7531 7532 pxor m10, m10 7533 lea r0, [r6 + 32] 7534 lea r2, [r7 + 32] 7535 call sa8d_8x8_12bit 7536 paddd m10, m0 7537 7538 lea r0, [r0 + r1 * 4] 7539 lea r2, [r2 + r3 * 4] 7540 call sa8d_8x8_12bit 7541 paddd m10, m0 7542 7543 lea r0, [r6 + 48] 7544 lea r2, [r7 + 48] 7545 call sa8d_8x8_12bit 7546 paddd m10, m0 7547 7548 lea r0, [r0 + r1 * 4] 7549 lea r2, [r2 + r3 * 4] 7550 call sa8d_8x8_12bit 7551 paddd m0, m10 7552 7553 vextracti128 xm6, m0, 1 7554 paddd xm0, xm6 7555 7556 movhlps xm6, xm0 7557 paddd xm0, xm6 7558 7559 pshuflw xm6, xm0, 0Eh 7560 paddd xm0, xm6 7561 paddd xm0, [pd_1] 7562 psrld xm0, 1 7563 paddd xm11, xm0 7564 7565 pxor m10, m10 7566 lea r0, [r6 + 64] 7567 lea r2, [r7 + 64] 7568 call sa8d_8x8_12bit 7569 paddd m10, m0 7570 7571 lea r0, [r0 + r1 * 4] 7572 lea r2, [r2 + r3 * 4] 7573 call sa8d_8x8_12bit 7574 paddd m10, m0 7575 7576 lea r0, [r6 + 80] 7577 lea r2, [r7 + 80] 7578 call sa8d_8x8_12bit 7579 paddd m10, m0 7580 7581 lea r0, [r0 + r1 * 4] 7582 lea r2, [r2 + r3 * 4] 7583 call sa8d_8x8_12bit 7584 paddd m0, m10 7585 7586 vextracti128 xm6, m0, 1 7587 paddd xm0, xm6 7588 7589 movhlps xm6, xm0 7590 paddd xm0, xm6 7591 7592 pshuflw xm6, xm0, 0Eh 7593 paddd xm0, xm6 7594 paddd xm0, [pd_1] 7595 psrld xm0, 1 7596 paddd xm11, xm0 
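    ; Each group of four sa8d_8x8_12bit calls above is summed into m10/m0, folded
    ; to a single dword by the vextracti128/movhlps/pshuflw sequence, rounded with
    ; (sum + 1) >> 1, and accumulated into the running total in xm11, which is
    ; finally returned through eax.  A hedged C sketch of that accumulation step
    ; (illustrative names, not the project's reference code):
    ;
    ;   uint32_t fold_and_round(const uint32_t partial[8])
    ;   {
    ;       uint32_t sum = 0;
    ;       for (int i = 0; i < 8; i++)     /* horizontal reduction of the ymm */
    ;           sum += partial[i];
    ;       return (sum + 1) >> 1;          /* paddd [pd_1]; psrld 1 */
    ;   }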
7597 7598 pxor m10, m10 7599 lea r0, [r6 + 96] 7600 lea r2, [r7 + 96] 7601 call sa8d_8x8_12bit 7602 paddd m10, m0 7603 7604 lea r0, [r0 + r1 * 4] 7605 lea r2, [r2 + r3 * 4] 7606 call sa8d_8x8_12bit 7607 paddd m10, m0 7608 7609 lea r0, [r6 + 112] 7610 lea r2, [r7 + 112] 7611 call sa8d_8x8_12bit 7612 paddd m10, m0 7613 7614 lea r0, [r0 + r1 * 4] 7615 lea r2, [r2 + r3 * 4] 7616 call sa8d_8x8_12bit 7617 paddd m0, m10 7618 7619 vextracti128 xm6, m0, 1 7620 paddd xm0, xm6 7621 7622 movhlps xm6, xm0 7623 paddd xm0, xm6 7624 7625 pshuflw xm6, xm0, 0Eh 7626 paddd xm0, xm6 7627 paddd xm0, [pd_1] 7628 psrld xm0, 1 7629 paddd xm11, xm0 7630 7631 lea r6, [r6 + r1 * 8] 7632 lea r6, [r6 + r1 * 8] 7633 lea r7, [r7 + r3 * 8] 7634 lea r7, [r7 + r3 * 8] 7635 pxor m10, m10 7636 mov r0, r6 7637 mov r2, r7 7638 call sa8d_8x8_12bit 7639 paddd m10, m0 7640 7641 lea r0, [r0 + r1 * 4] 7642 lea r2, [r2 + r3 * 4] 7643 call sa8d_8x8_12bit 7644 paddd m10, m0 7645 7646 lea r0, [r6 + 16] 7647 lea r2, [r7 + 16] 7648 call sa8d_8x8_12bit 7649 paddd m10, m0 7650 7651 lea r0, [r0 + r1 * 4] 7652 lea r2, [r2 + r3 * 4] 7653 call sa8d_8x8_12bit 7654 paddd m0, m10 7655 7656 vextracti128 xm6, m0, 1 7657 paddd xm0, xm6 7658 7659 movhlps xm6, xm0 7660 paddd xm0, xm6 7661 7662 pshuflw xm6, xm0, 0Eh 7663 paddd xm0, xm6 7664 paddd xm0, [pd_1] 7665 psrld xm0, 1 7666 paddd xm11, xm0 7667 7668 pxor m10, m10 7669 lea r0, [r6 + 32] 7670 lea r2, [r7 + 32] 7671 call sa8d_8x8_12bit 7672 paddd m10, m0 7673 7674 lea r0, [r0 + r1 * 4] 7675 lea r2, [r2 + r3 * 4] 7676 call sa8d_8x8_12bit 7677 paddd m10, m0 7678 7679 lea r0, [r6 + 48] 7680 lea r2, [r7 + 48] 7681 call sa8d_8x8_12bit 7682 paddd m10, m0 7683 7684 lea r0, [r0 + r1 * 4] 7685 lea r2, [r2 + r3 * 4] 7686 call sa8d_8x8_12bit 7687 paddd m0, m10 7688 7689 vextracti128 xm6, m0, 1 7690 paddd xm0, xm6 7691 7692 movhlps xm6, xm0 7693 paddd xm0, xm6 7694 7695 pshuflw xm6, xm0, 0Eh 7696 paddd xm0, xm6 7697 paddd xm0, [pd_1] 7698 psrld xm0, 1 7699 paddd xm11, xm0 7700 7701 pxor m10, m10 7702 lea r0, [r6 + 64] 7703 lea r2, [r7 + 64] 7704 call sa8d_8x8_12bit 7705 paddd m10, m0 7706 7707 lea r0, [r0 + r1 * 4] 7708 lea r2, [r2 + r3 * 4] 7709 call sa8d_8x8_12bit 7710 paddd m10, m0 7711 7712 lea r0, [r6 + 80] 7713 lea r2, [r7 + 80] 7714 call sa8d_8x8_12bit 7715 paddd m10, m0 7716 7717 lea r0, [r0 + r1 * 4] 7718 lea r2, [r2 + r3 * 4] 7719 call sa8d_8x8_12bit 7720 paddd m0, m10 7721 7722 vextracti128 xm6, m0, 1 7723 paddd xm0, xm6 7724 7725 movhlps xm6, xm0 7726 paddd xm0, xm6 7727 7728 pshuflw xm6, xm0, 0Eh 7729 paddd xm0, xm6 7730 paddd xm0, [pd_1] 7731 psrld xm0, 1 7732 paddd xm11, xm0 7733 7734 pxor m10, m10 7735 lea r0, [r6 + 96] 7736 lea r2, [r7 + 96] 7737 call sa8d_8x8_12bit 7738 paddd m10, m0 7739 7740 lea r0, [r0 + r1 * 4] 7741 lea r2, [r2 + r3 * 4] 7742 call sa8d_8x8_12bit 7743 paddd m10, m0 7744 7745 lea r0, [r6 + 112] 7746 lea r2, [r7 + 112] 7747 call sa8d_8x8_12bit 7748 paddd m10, m0 7749 7750 lea r0, [r0 + r1 * 4] 7751 lea r2, [r2 + r3 * 4] 7752 call sa8d_8x8_12bit 7753 paddd m0, m10 7754 7755 vextracti128 xm6, m0, 1 7756 paddd xm0, xm6 7757 7758 movhlps xm6, xm0 7759 paddd xm0, xm6 7760 7761 pshuflw xm6, xm0, 0Eh 7762 paddd xm0, xm6 7763 paddd xm0, [pd_1] 7764 psrld xm0, 1 7765 paddd xm11, xm0 7766 7767 lea r6, [r6 + r1 * 8] 7768 lea r6, [r6 + r1 * 8] 7769 lea r7, [r7 + r3 * 8] 7770 lea r7, [r7 + r3 * 8] 7771 pxor m10, m10 7772 mov r0, r6 7773 mov r2, r7 7774 call sa8d_8x8_12bit 7775 paddd m10, m0 7776 7777 lea r0, [r0 + r1 * 4] 7778 lea r2, [r2 + r3 * 4] 7779 call sa8d_8x8_12bit 7780 paddd 
m10, m0 7781 7782 lea r0, [r6 + 16] 7783 lea r2, [r7 + 16] 7784 call sa8d_8x8_12bit 7785 paddd m10, m0 7786 7787 lea r0, [r0 + r1 * 4] 7788 lea r2, [r2 + r3 * 4] 7789 call sa8d_8x8_12bit 7790 paddd m0, m10 7791 7792 vextracti128 xm6, m0, 1 7793 paddd xm0, xm6 7794 7795 movhlps xm6, xm0 7796 paddd xm0, xm6 7797 7798 pshuflw xm6, xm0, 0Eh 7799 paddd xm0, xm6 7800 paddd xm0, [pd_1] 7801 psrld xm0, 1 7802 paddd xm11, xm0 7803 7804 pxor m10, m10 7805 lea r0, [r6 + 32] 7806 lea r2, [r7 + 32] 7807 call sa8d_8x8_12bit 7808 paddd m10, m0 7809 7810 lea r0, [r0 + r1 * 4] 7811 lea r2, [r2 + r3 * 4] 7812 call sa8d_8x8_12bit 7813 paddd m10, m0 7814 7815 lea r0, [r6 + 48] 7816 lea r2, [r7 + 48] 7817 call sa8d_8x8_12bit 7818 paddd m10, m0 7819 7820 lea r0, [r0 + r1 * 4] 7821 lea r2, [r2 + r3 * 4] 7822 call sa8d_8x8_12bit 7823 paddd m0, m10 7824 7825 vextracti128 xm6, m0, 1 7826 paddd xm0, xm6 7827 7828 movhlps xm6, xm0 7829 paddd xm0, xm6 7830 7831 pshuflw xm6, xm0, 0Eh 7832 paddd xm0, xm6 7833 paddd xm0, [pd_1] 7834 psrld xm0, 1 7835 paddd xm11, xm0 7836 7837 pxor m10, m10 7838 lea r0, [r6 + 64] 7839 lea r2, [r7 + 64] 7840 call sa8d_8x8_12bit 7841 paddd m10, m0 7842 7843 lea r0, [r0 + r1 * 4] 7844 lea r2, [r2 + r3 * 4] 7845 call sa8d_8x8_12bit 7846 paddd m10, m0 7847 7848 lea r0, [r6 + 80] 7849 lea r2, [r7 + 80] 7850 call sa8d_8x8_12bit 7851 paddd m10, m0 7852 7853 lea r0, [r0 + r1 * 4] 7854 lea r2, [r2 + r3 * 4] 7855 call sa8d_8x8_12bit 7856 paddd m0, m10 7857 7858 vextracti128 xm6, m0, 1 7859 paddd xm0, xm6 7860 7861 movhlps xm6, xm0 7862 paddd xm0, xm6 7863 7864 pshuflw xm6, xm0, 0Eh 7865 paddd xm0, xm6 7866 paddd xm0, [pd_1] 7867 psrld xm0, 1 7868 paddd xm11, xm0 7869 7870 pxor m10, m10 7871 lea r0, [r6 + 96] 7872 lea r2, [r7 + 96] 7873 call sa8d_8x8_12bit 7874 paddd m10, m0 7875 7876 lea r0, [r0 + r1 * 4] 7877 lea r2, [r2 + r3 * 4] 7878 call sa8d_8x8_12bit 7879 paddd m10, m0 7880 7881 lea r0, [r6 + 112] 7882 lea r2, [r7 + 112] 7883 call sa8d_8x8_12bit 7884 paddd m10, m0 7885 7886 lea r0, [r0 + r1 * 4] 7887 lea r2, [r2 + r3 * 4] 7888 call sa8d_8x8_12bit 7889 paddd m0, m10 7890 7891 vextracti128 xm6, m0, 1 7892 paddd xm0, xm6 7893 7894 movhlps xm6, xm0 7895 paddd xm0, xm6 7896 7897 pshuflw xm6, xm0, 0Eh 7898 paddd xm0, xm6 7899 paddd xm0, [pd_1] 7900 psrld xm0, 1 7901 paddd xm11, xm0 7902 movd eax, xm11 7903 RET 7904%endif 7905 7906 7907;============================================================================= 7908; INTRA SATD 7909;============================================================================= 7910%define TRANS TRANS_SSE2 7911%define DIFFOP DIFF_UNPACK_SSE2 7912%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P 7913%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 7914%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size 7915%define movdqu movups 7916%define punpcklqdq movlhps 7917INIT_XMM sse2 7918%if BIT_DEPTH <= 10 7919SA8D 7920%endif 7921SATDS_SSE2 7922 7923%if HIGH_BIT_DEPTH == 0 7924INIT_XMM ssse3,atom 7925SATDS_SSE2 7926SA8D 7927%endif 7928 7929%define DIFFOP DIFF_SUMSUB_SSSE3 7930%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE 7931%if HIGH_BIT_DEPTH == 0 7932%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 7933%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 7934%endif 7935INIT_XMM ssse3 7936%if BIT_DEPTH <= 10 7937SA8D 7938%endif 7939SATDS_SSE2 7940%undef movdqa ; nehalem doesn't like movaps 7941%undef movdqu ; movups 7942%undef punpcklqdq ; or movlhps 7943 7944%define TRANS TRANS_SSE4 7945%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN 7946INIT_XMM sse4 7947%if BIT_DEPTH <= 10 
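; The SA8D and SATDS_SSE2 templates are expanded once per instruction set: each
; INIT_XMM block first retargets TRANS, DIFFOP and the LOAD_* helpers to that
; ISA's preferred primitives (see the %define/%undef groups above), then
; instantiates the same pixel_satd_*/pixel_sa8d_* bodies, so a single template
; produces the sse2, ssse3, sse4, avx and xop entry points.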
7948SA8D 7949%endif 7950SATDS_SSE2 7951 7952; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so 7953; it's effectively free. 7954%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE 7955INIT_XMM avx 7956SA8D 7957SATDS_SSE2 7958 7959%define TRANS TRANS_XOP 7960INIT_XMM xop 7961%if BIT_DEPTH <= 10 7962SA8D 7963%endif 7964SATDS_SSE2 7965 7966%if HIGH_BIT_DEPTH == 0 7967%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 7968%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 7969%define TRANS TRANS_SSE4 7970 7971%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] 7972 movddup xm%1, [r0] 7973 movddup xm%3, [r2] 7974 movddup xm%2, [r0+4*r1] 7975 movddup xm%5, [r2+4*r3] 7976 vinserti128 m%1, m%1, xm%2, 1 7977 vinserti128 m%3, m%3, xm%5, 1 7978 7979 movddup xm%2, [r0+r1] 7980 movddup xm%4, [r2+r3] 7981 movddup xm%5, [r0+r4] 7982 movddup xm%6, [r2+r5] 7983 vinserti128 m%2, m%2, xm%5, 1 7984 vinserti128 m%4, m%4, xm%6, 1 7985 7986 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 7987 lea r0, [r0+2*r1] 7988 lea r2, [r2+2*r3] 7989 7990 movddup xm%3, [r0] 7991 movddup xm%5, [r0+4*r1] 7992 vinserti128 m%3, m%3, xm%5, 1 7993 7994 movddup xm%5, [r2] 7995 movddup xm%4, [r2+4*r3] 7996 vinserti128 m%5, m%5, xm%4, 1 7997 7998 movddup xm%4, [r0+r1] 7999 movddup xm%6, [r0+r4] 8000 vinserti128 m%4, m%4, xm%6, 1 8001 8002 movq xm%6, [r2+r3] 8003 movhps xm%6, [r2+r5] 8004 vpermq m%6, m%6, q1100 8005 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 8006%endmacro 8007 8008%macro SATD_START_AVX2 2-3 0 8009 FIX_STRIDES r1, r3 8010%if %3 8011 mova %2, [hmul_8p] 8012 lea r4, [5*r1] 8013 lea r5, [5*r3] 8014%else 8015 mova %2, [hmul_16p] 8016 lea r4, [3*r1] 8017 lea r5, [3*r3] 8018%endif 8019 pxor %1, %1 8020%endmacro 8021 8022%define TRANS TRANS_SSE4 8023INIT_YMM avx2 8024cglobal pixel_satd_16x8_internal 8025 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 8026 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 8027 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 8028 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 8029 ret 8030 8031cglobal pixel_satd_16x16, 4,6,8 8032 SATD_START_AVX2 m6, m7 8033 call pixel_satd_16x8_internal 8034 lea r0, [r0+4*r1] 8035 lea r2, [r2+4*r3] 8036pixel_satd_16x8_internal: 8037 call pixel_satd_16x8_internal 8038 vextracti128 xm0, m6, 1 8039 paddw xm0, xm6 8040 SATD_END_SSE2 xm0 8041 RET 8042 8043cglobal pixel_satd_16x8, 4,6,8 8044 SATD_START_AVX2 m6, m7 8045 jmp pixel_satd_16x8_internal 8046 8047cglobal pixel_satd_8x8_internal 8048 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 8049 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 8050 ret 8051 8052cglobal pixel_satd_8x16, 4,6,8 8053 SATD_START_AVX2 m6, m7, 1 8054 call pixel_satd_8x8_internal 8055 lea r0, [r0+2*r1] 8056 lea r2, [r2+2*r3] 8057 lea r0, [r0+4*r1] 8058 lea r2, [r2+4*r3] 8059 call pixel_satd_8x8_internal 8060 vextracti128 xm0, m6, 1 8061 paddw xm0, xm6 8062 SATD_END_SSE2 xm0 8063 RET 8064 8065cglobal pixel_satd_8x8, 4,6,8 8066 SATD_START_AVX2 m6, m7, 1 8067 call pixel_satd_8x8_internal 8068 vextracti128 xm0, m6, 1 8069 paddw xm0, xm6 8070 SATD_END_SSE2 xm0 8071 RET 8072 8073cglobal pixel_sa8d_8x8_internal 8074 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 8075 HADAMARD4_V 0, 1, 2, 3, 4 8076 HADAMARD 8, sumsub, 0, 1, 4, 5 8077 HADAMARD 8, sumsub, 2, 3, 4, 5 8078 HADAMARD 2, sumsub, 0, 1, 4, 5 8079 HADAMARD 2, sumsub, 2, 3, 4, 5 8080 HADAMARD 1, amax, 0, 1, 4, 5 8081 HADAMARD 1, amax, 2, 3, 4, 5 8082 paddw m6, m0 8083 paddw m6, m2 8084 ret 8085 8086cglobal pixel_sa8d_8x8, 4,6,8 8087 SATD_START_AVX2 m6, m7, 1 8088 call pixel_sa8d_8x8_internal 8089 vextracti128 xm1, m6, 1 8090 paddw xm6, xm1 8091 HADDW 
xm6, xm1 8092 movd eax, xm6 8093 add eax, 1 8094 shr eax, 1 8095 RET 8096 8097cglobal pixel_sa8d_16x16, 4,6,8 8098 SATD_START_AVX2 m6, m7, 1 8099 8100 call pixel_sa8d_8x8_internal ; pix[0] 8101 8102 sub r0, r1 8103 sub r0, r1 8104 add r0, 8*SIZEOF_PIXEL 8105 sub r2, r3 8106 sub r2, r3 8107 add r2, 8*SIZEOF_PIXEL 8108 call pixel_sa8d_8x8_internal ; pix[8] 8109 8110 add r0, r4 8111 add r0, r1 8112 add r2, r5 8113 add r2, r3 8114 call pixel_sa8d_8x8_internal ; pix[8*stride+8] 8115 8116 sub r0, r1 8117 sub r0, r1 8118 sub r0, 8*SIZEOF_PIXEL 8119 sub r2, r3 8120 sub r2, r3 8121 sub r2, 8*SIZEOF_PIXEL 8122 call pixel_sa8d_8x8_internal ; pix[8*stride] 8123 8124 ; TODO: analyze Dynamic Range 8125 vextracti128 xm0, m6, 1 8126 paddusw xm6, xm0 8127 HADDUW xm6, xm0 8128 movd eax, xm6 8129 add eax, 1 8130 shr eax, 1 8131 RET 8132 8133cglobal pixel_sa8d_16x16_internal 8134 call pixel_sa8d_8x8_internal ; pix[0] 8135 8136 sub r0, r1 8137 sub r0, r1 8138 add r0, 8*SIZEOF_PIXEL 8139 sub r2, r3 8140 sub r2, r3 8141 add r2, 8*SIZEOF_PIXEL 8142 call pixel_sa8d_8x8_internal ; pix[8] 8143 8144 add r0, r4 8145 add r0, r1 8146 add r2, r5 8147 add r2, r3 8148 call pixel_sa8d_8x8_internal ; pix[8*stride+8] 8149 8150 sub r0, r1 8151 sub r0, r1 8152 sub r0, 8*SIZEOF_PIXEL 8153 sub r2, r3 8154 sub r2, r3 8155 sub r2, 8*SIZEOF_PIXEL 8156 call pixel_sa8d_8x8_internal ; pix[8*stride] 8157 8158 ; TODO: analyze Dynamic Range 8159 vextracti128 xm0, m6, 1 8160 paddusw xm6, xm0 8161 HADDUW xm6, xm0 8162 movd eax, xm6 8163 add eax, 1 8164 shr eax, 1 8165 ret 8166 8167%if ARCH_X86_64 8168cglobal pixel_sa8d_32x32, 4,8,8 8169 ; TODO: R6 is RAX on x64 platform, so we use it directly 8170 8171 SATD_START_AVX2 m6, m7, 1 8172 xor r7d, r7d 8173 8174 call pixel_sa8d_16x16_internal ; [0] 8175 pxor m6, m6 8176 add r7d, eax 8177 8178 add r0, r4 8179 add r0, r1 8180 add r2, r5 8181 add r2, r3 8182 call pixel_sa8d_16x16_internal ; [2] 8183 pxor m6, m6 8184 add r7d, eax 8185 8186 lea eax, [r4 * 5 - 16] 8187 sub r0, rax 8188 sub r0, r1 8189 lea eax, [r5 * 5 - 16] 8190 sub r2, rax 8191 sub r2, r3 8192 call pixel_sa8d_16x16_internal ; [1] 8193 pxor m6, m6 8194 add r7d, eax 8195 8196 add r0, r4 8197 add r0, r1 8198 add r2, r5 8199 add r2, r3 8200 call pixel_sa8d_16x16_internal ; [3] 8201 add eax, r7d 8202 RET 8203%endif ; ARCH_X86_64=1 8204%endif ; HIGH_BIT_DEPTH 8205 8206%macro SATD_AVX512_LOAD4 2 ; size, opmask 8207 vpbroadcast%1 m0, [r0] 8208 vpbroadcast%1 m0 {%2}, [r0+2*r1] 8209 vpbroadcast%1 m2, [r2] 8210 vpbroadcast%1 m2 {%2}, [r2+2*r3] 8211 add r0, r1 8212 add r2, r3 8213 vpbroadcast%1 m1, [r0] 8214 vpbroadcast%1 m1 {%2}, [r0+2*r1] 8215 vpbroadcast%1 m3, [r2] 8216 vpbroadcast%1 m3 {%2}, [r2+2*r3] 8217%endmacro 8218 8219%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3 8220 vpbroadcast%1 %{2}0, [r0] 8221 vpbroadcast%1 %{2}0 {%3}, [r0+2*r1] 8222 vpbroadcast%1 %{2}2, [r2] 8223 vpbroadcast%1 %{2}2 {%3}, [r2+2*r3] 8224 vpbroadcast%1 m0 {%4}, [r0+4*r1] 8225 vpbroadcast%1 m2 {%4}, [r2+4*r3] 8226 vpbroadcast%1 m0 {%5}, [r0+2*r4] 8227 vpbroadcast%1 m2 {%5}, [r2+2*r5] 8228 vpbroadcast%1 %{2}1, [r0+r1] 8229 vpbroadcast%1 %{2}1 {%3}, [r0+r4] 8230 vpbroadcast%1 %{2}3, [r2+r3] 8231 vpbroadcast%1 %{2}3 {%3}, [r2+r5] 8232 lea r0, [r0+4*r1] 8233 lea r2, [r2+4*r3] 8234 vpbroadcast%1 m1 {%4}, [r0+r1] 8235 vpbroadcast%1 m3 {%4}, [r2+r3] 8236 vpbroadcast%1 m1 {%5}, [r0+r4] 8237 vpbroadcast%1 m3 {%5}, [r2+r5] 8238%endmacro 8239 8240%macro SATD_AVX512_PACKED 0 8241 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 8242 SUMSUB_BA w, 0, 1, 2 8243 SBUTTERFLY 
qdq, 0, 1, 2 8244 SUMSUB_BA w, 0, 1, 2 8245 HMAXABSW2 0, 1, 2, 3 8246%endmacro 8247 8248%macro SATD_AVX512_END 0-1 0 ; sa8d 8249 paddw m0 {k1}{z}, m1 ; zero-extend to dwords 8250%if ARCH_X86_64 8251%if mmsize == 64 8252 vextracti32x8 ym1, m0, 1 8253 paddd ym0, ym1 8254%endif 8255%if mmsize >= 32 8256 vextracti128 xm1, ym0, 1 8257 paddd xmm0, xm0, xm1 8258%endif 8259 punpckhqdq xmm1, xmm0, xmm0 8260 paddd xmm0, xmm1 8261 movq rax, xmm0 8262 rorx rdx, rax, 32 8263%if %1 8264 lea eax, [rax+rdx+1] 8265 shr eax, 1 8266%else 8267 add eax, edx 8268%endif 8269%else 8270 HADDD m0, m1 8271 movd eax, xm0 8272%if %1 8273 inc eax 8274 shr eax, 1 8275%endif 8276%endif 8277 RET 8278%endmacro 8279 8280%macro HMAXABSW2 4 ; a, b, tmp1, tmp2 8281 pabsw m%1, m%1 8282 pabsw m%2, m%2 8283 psrldq m%3, m%1, 2 8284 psrld m%4, m%2, 16 8285 pmaxsw m%1, m%3 8286 pmaxsw m%2, m%4 8287%endmacro 8288%if HIGH_BIT_DEPTH==0 8289INIT_ZMM avx512 8290cglobal pixel_satd_16x8_internal 8291 vbroadcasti64x4 m6, [hmul_16p] 8292 kxnorb k2, k2, k2 8293 mov r4d, 0x55555555 8294 knotw k2, k2 8295 kmovd k1, r4d 8296 lea r4, [3*r1] 8297 lea r5, [3*r3] 8298satd_16x8_avx512: 8299 vbroadcasti128 ym0, [r0] 8300 vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4 8301 vbroadcasti128 ym4, [r2] 8302 vbroadcasti32x4 m4 {k2}, [r2+4*r3] 8303 vbroadcasti128 ym2, [r0+2*r1] 8304 vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6 8305 vbroadcasti128 ym5, [r2+2*r3] 8306 vbroadcasti32x4 m5 {k2}, [r2+2*r5] 8307 DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6 8308 vbroadcasti128 ym1, [r0+r1] 8309 vbroadcasti128 ym4, [r2+r3] 8310 vbroadcasti128 ym3, [r0+r4] 8311 vbroadcasti128 ym5, [r2+r5] 8312 lea r0, [r0+4*r1] 8313 lea r2, [r2+4*r3] 8314 vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5 8315 vbroadcasti32x4 m4 {k2}, [r2+r3] 8316 vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7 8317 vbroadcasti32x4 m5 {k2}, [r2+r5] 8318 DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6 8319 HADAMARD4_V 0, 1, 2, 3, 4 8320 HMAXABSW2 0, 2, 4, 5 8321 HMAXABSW2 1, 3, 4, 5 8322 paddw m4, m0, m2 ; m1 8323 paddw m2, m1, m3 ; m0 8324 ret 8325 8326cglobal pixel_satd_8x8_internal 8327 vbroadcasti64x4 m4, [hmul_16p] 8328 mov r4d, 0x55555555 8329 kmovd k1, r4d ; 01010101 8330 kshiftlb k2, k1, 5 ; 10100000 8331 kshiftlb k3, k1, 4 ; 01010000 8332 lea r4, [3*r1] 8333 lea r5, [3*r3] 8334satd_8x8_avx512: 8335 SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 8336 SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5 8337 ret 8338 8339cglobal pixel_satd_16x8, 4,6 8340 call pixel_satd_16x8_internal_avx512 8341 jmp satd_zmm_avx512_end 8342 8343cglobal pixel_satd_16x16, 4,6 8344 call pixel_satd_16x8_internal_avx512 8345 lea r0, [r0+4*r1] 8346 lea r2, [r2+4*r3] 8347 paddw m7, m0, m1 8348 call satd_16x8_avx512 8349 paddw m1, m7 8350 jmp satd_zmm_avx512_end 8351 8352cglobal pixel_satd_8x8, 4,6 8353 call pixel_satd_8x8_internal_avx512 8354satd_zmm_avx512_end: 8355 SATD_AVX512_END 8356 8357cglobal pixel_satd_8x16, 4,6 8358 call pixel_satd_8x8_internal_avx512 8359 lea r0, [r0+4*r1] 8360 lea r2, [r2+4*r3] 8361 paddw m5, m0, m1 8362 call satd_8x8_avx512 8363 paddw m1, m5 8364 jmp satd_zmm_avx512_end 8365 8366INIT_YMM avx512 8367cglobal pixel_satd_4x8_internal 8368 vbroadcasti128 m4, [hmul_4p] 8369 mov r4d, 0x55550c 8370 kmovd k2, r4d ; 00001100 8371 kshiftlb k3, k2, 2 ; 00110000 8372 kshiftlb k4, k2, 4 ; 11000000 8373 kshiftrd k1, k2, 8 ; 01010101 8374 lea r4, [3*r1] 8375 lea r5, [3*r3] 8376satd_4x8_avx512: 8377 SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6 8378satd_ymm_avx512: ; 1 1 3 3 5 5 7 7 8379 SATD_AVX512_PACKED 8380 ret 8381 8382cglobal pixel_satd_8x4, 4,5 
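    ; The pixel_satd_* kernels return the sum of absolute transformed differences:
    ; the residual is pushed through 4x4 Hadamard transforms and the absolute
    ; coefficients are summed.  A hedged scalar sketch of one 4x4 block (assumed
    ; helper, not the project's C reference; the final scaling is illustrative):
    ;
    ;   int satd_4x4(const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb)
    ;   {
    ;       int d[4][4], t[4][4], sum = 0;
    ;       for (int i = 0; i < 4; i++)
    ;           for (int j = 0; j < 4; j++)
    ;               d[i][j] = a[i * sa + j] - b[i * sb + j];
    ;       for (int i = 0; i < 4; i++) {                   /* horizontal pass */
    ;           int s0 = d[i][0] + d[i][1], s1 = d[i][2] + d[i][3];
    ;           int f0 = d[i][0] - d[i][1], f1 = d[i][2] - d[i][3];
    ;           t[i][0] = s0 + s1;  t[i][1] = s0 - s1;
    ;           t[i][2] = f0 + f1;  t[i][3] = f0 - f1;
    ;       }
    ;       for (int j = 0; j < 4; j++) {                   /* vertical pass + |.| */
    ;           int s0 = t[0][j] + t[1][j], s1 = t[2][j] + t[3][j];
    ;           int f0 = t[0][j] - t[1][j], f1 = t[2][j] - t[3][j];
    ;           sum += abs(s0 + s1) + abs(s0 - s1) + abs(f0 + f1) + abs(f0 - f1);
    ;       }
    ;       return sum >> 1;
    ;   }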
    mova            m4, [hmul_16p]
    mov             r4d, 0x5555
    kmovw           k1, r4d
    SATD_AVX512_LOAD4 q, k1         ; 2 0 2 0
    call            satd_ymm_avx512 ; 3 1 3 1
    jmp             satd_ymm_avx512_end2

cglobal pixel_satd_4x8, 4,6
    call            pixel_satd_4x8_internal_avx512
satd_ymm_avx512_end:
%if ARCH_X86_64 == 0
    pop             r5d
    %assign regs_used 5
%endif
satd_ymm_avx512_end2:
    SATD_AVX512_END

cglobal pixel_satd_4x16, 4,6
    call            pixel_satd_4x8_internal_avx512
    lea             r0, [r0+4*r1]
    lea             r2, [r2+4*r3]
    paddw           m5, m0, m1
    call            satd_4x8_avx512
    paddw           m1, m5
    jmp             satd_ymm_avx512_end

INIT_XMM avx512
cglobal pixel_satd_4x4, 4,5
    mova            m4, [hmul_4p]
    mov             r4d, 0x550c
    kmovw           k2, r4d
    kshiftrw        k1, k2, 8
    SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
    SATD_AVX512_PACKED      ; 1 1 3 3
    SWAP            0, 1
    SATD_AVX512_END

INIT_ZMM avx512
cglobal pixel_sa8d_8x8, 4,6
    vbroadcasti64x4 m4, [hmul_16p]
    mov             r4d, 0x55555555
    kmovd           k1, r4d   ; 01010101
    kshiftlb        k2, k1, 5 ; 10100000
    kshiftlb        k3, k1, 4 ; 01010000
    lea             r4, [3*r1]
    lea             r5, [3*r3]
    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4     ; 3 1 3 1 7 5 7 5
    SUMSUB_BA       w, 0, 1, 2
    SBUTTERFLY      qdq, 0, 1, 2
    SUMSUB_BA       w, 0, 1, 2
    shufps          m2, m0, m1, q2020
    shufps          m1, m0, m1, q3131
    SUMSUB_BA       w, 2, 1, 0
    vshufi32x4      m0, m2, m1, q1010
    vshufi32x4      m1, m2, m1, q3232
    SUMSUB_BA       w, 0, 1, 2
    HMAXABSW2       0, 1, 2, 3
    SATD_AVX512_END 1
%endif

; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal downShift_16, 4,7,3
    mov         r4d, r4m
    mov         r5d, r5m
    movd        m0, r6m    ; m0 = shift
    add         r1, r1

    dec         r5d
.loopH:
    xor         r6, r6

.loopW:
    movu        m1, [r0 + r6 * 2]
    movu        m2, [r0 + r6 * 2 + mmsize]
    psrlw       m1, m0
    psrlw       m2, m0
    packuswb    m1, m2
    movu        [r2 + r6], m1

    add         r6, mmsize
    cmp         r6d, r4d
    jl          .loopW

    ; move to next row
    add         r0, r1
    add         r2, r3
    dec         r5d
    jnz         .loopH

    ; process the last row of every frame [to handle width that is not a multiple of 16]
    ; r4d must be greater than or equal to 16 (mmsize)
.loop16:
    movu        m1, [r0 + (r4 - mmsize) * 2]
    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
    psrlw       m1, m0
    psrlw       m2, m0
    packuswb    m1, m2
    movu        [r2 + r4 - mmsize], m1

    sub         r4d, mmsize
    jz          .end
    cmp         r4d, mmsize
    jge         .loop16

    ; process partial pixels
    movu        m1, [r0]
    movu        m2, [r0 + mmsize]
    psrlw       m1, m0
    psrlw       m2, m0
    packuswb    m1, m2
    movu        [r2], m1

.end:
    RET

; Input 10bit, Output 8bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
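; downShift_16 narrows a 16-bit plane to 8 bits: every sample is shifted right by
; 'shift' and packed with unsigned saturation (packuswb).  Rows 0..height-2 are
; covered by .loopH/.loopW in mmsize-wide chunks; the last row is then re-walked
; from its right edge (.loop16 / .loop32) so the tail never stores past the row
; end.  A hedged scalar equivalent of the per-sample operation (illustrative only,
; not the project's C reference; the 'mask' argument is not applied here):
;
;   for (int y = 0; y < height; y++)
;       for (int x = 0; x < width; x++)
;           dst[y * dstStride + x] = (uint8_t)(src[y * srcStride + x] >> shift);
;
; The upShift_8/upShift_16 kernels further down do the opposite widening: shift
; left by 'shift', and for upShift_16 clamp the result against [pw_pixel_max].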
INIT_YMM avx2
cglobal downShift_16, 4,7,3
    mov         r4d, r4m
    mov         r5d, r5m
    movd        xm0, r6m    ; m0 = shift
    add         r1d, r1d

    dec         r5d
.loopH:
    xor         r6, r6

.loopW:
    movu        m1, [r0 + r6 * 2 +  0]
    movu        m2, [r0 + r6 * 2 + 32]
    vpsrlw      m1, xm0
    vpsrlw      m2, xm0
    packuswb    m1, m2
    vpermq      m1, m1, 11011000b
    movu        [r2 + r6], m1

    add         r6d, mmsize
    cmp         r6d, r4d
    jl          .loopW

    ; move to next row
    add         r0, r1
    add         r2, r3
    dec         r5d
    jnz         .loopH

    ; process the last row of every frame [to handle width that is not a multiple of 32]

.loop32:
    movu        m1, [r0 + (r4 - mmsize) * 2]
    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
    psrlw       m1, xm0
    psrlw       m2, xm0
    packuswb    m1, m2
    vpermq      m1, m1, q3120
    movu        [r2 + r4 - mmsize], m1

    sub         r4d, mmsize
    jz          .end
    cmp         r4d, mmsize
    jge         .loop32

    ; process partial pixels
    movu        m1, [r0]
    movu        m2, [r0 + mmsize]
    psrlw       m1, xm0
    psrlw       m2, xm0
    packuswb    m1, m2
    vpermq      m1, m1, q3120
    movu        [r2], m1

.end:
    RET

; Input 8bit, Output 10bit
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal upShift_8, 6,7,3
    movd        xm2, r6m
    add         r3d, r3d
    dec         r5d

.loopH:
    xor         r6, r6
.loopW:
    pmovzxbw    m0, [r0 + r6]
    pmovzxbw    m1, [r0 + r6 + mmsize/2]
    psllw       m0, m2
    psllw       m1, m2
    movu        [r2 + r6 * 2], m0
    movu        [r2 + r6 * 2 + mmsize], m1

    add         r6d, mmsize
    cmp         r6d, r4d
    jl          .loopW

    ; move to next row
    add         r0, r1
    add         r2, r3
    dec         r5d
    jg          .loopH

    ; process the last row of every frame [to handle width that is not a multiple of 16]
    mov         r1d, (mmsize/2 - 1)
    and         r1d, r4d
    sub         r1, mmsize/2

    ; NOTE: Width MUST BE greater than or equal to 8
    shr         r4d, 3          ; loop count = width / (mmsize/2)
.loopW8:
    pmovzxbw    m0, [r0]
    psllw       m0, m2
    movu        [r2], m0
    add         r0, mmsize/2
    add         r2, mmsize
    dec         r4d
    jg          .loopW8

    ; Mac OS X can't read beyond the array bound, so roll back a few bytes
    pmovzxbw    m0, [r0 + r1]
    psllw       m0, m2
    movu        [r2 + r1 * 2], m0
    RET


;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
cglobal upShift_8, 6,7,3
    movd        xm2, r6m
    add         r3d, r3d
    dec         r5d

.loopH:
    xor         r6, r6
.loopW:
    pmovzxbw    m0, [r0 + r6]
    pmovzxbw    m1, [r0 + r6 + mmsize/2]
    psllw       m0, xm2
    psllw       m1, xm2
    movu        [r2 + r6 * 2], m0
    movu        [r2 + r6 * 2 + mmsize], m1

    add         r6d, mmsize
    cmp         r6d, r4d
    jl          .loopW

    ; move to next row
    add         r0, r1
    add         r2, r3
    dec         r5d
    jg          .loopH

    ; process the last row of every frame [to handle width that is not a multiple of 32]
    mov         r1d, (mmsize/2 - 1)
    and         r1d, r4d
    sub         r1, mmsize/2

    ; NOTE: Width
MUST BE more than or equal to 16 8653 shr r4d, 4 ; log2(mmsize) 8654.loopW16: 8655 pmovzxbw m0,[r0] 8656 psllw m0, xm2 8657 movu [r2], m0 8658 add r0, mmsize/2 8659 add r2, mmsize 8660 dec r4d 8661 jg .loopW16 8662 8663 ; Mac OS X can't read beyond array bound, so rollback some bytes 8664 pmovzxbw m0,[r0 + r1] 8665 psllw m0, xm2 8666 movu [r2 + r1 * 2], m0 8667 RET 8668%endif 8669 8670%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp 8671%if cpuflag(ssse3) 8672 pabsd %1, %3 8673 pabsd %2, %4 8674%elifidn %1, %3 8675 pxor %5, %5 8676 pxor %6, %6 8677 psubd %5, %1 8678 psubd %6, %2 8679 pmaxsd %1, %5 8680 pmaxsd %2, %6 8681%else 8682 pxor %1, %1 8683 pxor %2, %2 8684 psubd %1, %3 8685 psubd %2, %4 8686 pmaxsd %1, %3 8687 pmaxsd %2, %4 8688%endif 8689%endmacro 8690 8691 8692; Input 10bit, Output 12bit 8693;------------------------------------------------------------------------------------------------------------------------ 8694;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) 8695;------------------------------------------------------------------------------------------------------------------------ 8696INIT_XMM sse2 8697cglobal upShift_16, 4,7,4 8698 mov r4d, r4m 8699 mov r5d, r5m 8700 movd m0, r6m ; m0 = shift 8701 mova m3, [pw_pixel_max] 8702 FIX_STRIDES r1d, r3d 8703 dec r5d 8704.loopH: 8705 xor r6d, r6d 8706.loopW: 8707 movu m1, [r0 + r6 * SIZEOF_PIXEL] 8708 movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize] 8709 psllw m1, m0 8710 psllw m2, m0 8711 ; TODO: if input always valid, we can remove below 2 instructions. 8712 pand m1, m3 8713 pand m2, m3 8714 movu [r2 + r6 * SIZEOF_PIXEL], m1 8715 movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2 8716 8717 add r6, mmsize * 2 / SIZEOF_PIXEL 8718 cmp r6d, r4d 8719 jl .loopW 8720 8721 ; move to next row 8722 add r0, r1 8723 add r2, r3 8724 dec r5d 8725 jnz .loopH 8726 8727 ;processing last row of every frame [To handle width which not a multiple of 16] 8728 8729 ; WARNING: width(r4d) MUST BE more than or equal to 16(mmsize) in here 8730.loop16: 8731 movu m1, [r0 + (r4 - mmsize) * 2] 8732 movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] 8733 psllw m1, m0 8734 psllw m2, m0 8735 pand m1, m3 8736 pand m2, m3 8737 movu [r2 + (r4 - mmsize) * 2], m1 8738 movu [r2 + (r4 - mmsize) * 2 + mmsize], m2 8739 8740 sub r4d, mmsize 8741 jz .end 8742 cmp r4d, mmsize 8743 jge .loop16 8744 8745 ; process partial pixels 8746 movu m1, [r0] 8747 movu m2, [r0 + mmsize] 8748 psllw m1, m0 8749 psllw m2, m0 8750 pand m1, m3 8751 pand m2, m3 8752 movu [r2], m1 8753 movu [r2 + mmsize], m2 8754 8755.end: 8756 RET 8757 8758; Input 10bit, Output 12bit 8759;------------------------------------------------------------------------------------------------------------------------------------- 8760;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) 8761;------------------------------------------------------------------------------------------------------------------------------------- 8762INIT_YMM avx2 8763cglobal upShift_16, 4,7,4 8764 mov r4d, r4m 8765 mov r5d, r5m 8766 movd xm0, r6m ; m0 = shift 8767 vbroadcasti128 m3, [pw_pixel_max] 8768 FIX_STRIDES r1d, r3d 8769 dec r5d 8770.loopH: 8771 xor r6d, r6d 8772.loopW: 8773 movu m1, [r0 + r6 * SIZEOF_PIXEL] 8774 movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize] 8775 psllw m1, xm0 8776 psllw m2, xm0 8777 pand m1, m3 8778 pand m2, m3 8779 movu [r2 + r6 * SIZEOF_PIXEL], m1 8780 movu [r2 + r6 * 
SIZEOF_PIXEL + mmsize], m2 8781 8782 add r6, mmsize * 2 / SIZEOF_PIXEL 8783 cmp r6d, r4d 8784 jl .loopW 8785 8786 ; move to next row 8787 add r0, r1 8788 add r2, r3 8789 dec r5d 8790 jnz .loopH 8791 8792 ; processing last row of every frame [To handle width which not a multiple of 32] 8793 8794.loop32: 8795 movu m1, [r0 + (r4 - mmsize) * 2] 8796 movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] 8797 psllw m1, xm0 8798 psllw m2, xm0 8799 pand m1, m3 8800 pand m2, m3 8801 movu [r2 + (r4 - mmsize) * 2], m1 8802 movu [r2 + (r4 - mmsize) * 2 + mmsize], m2 8803 8804 sub r4d, mmsize 8805 jz .end 8806 cmp r4d, mmsize 8807 jge .loop32 8808 8809 ; process partial pixels 8810 movu m1, [r0] 8811 movu m2, [r0 + mmsize] 8812 psllw m1, xm0 8813 psllw m2, xm0 8814 pand m1, m3 8815 pand m2, m3 8816 movu [r2], m1 8817 movu [r2 + mmsize], m2 8818 8819.end: 8820 RET 8821INIT_ZMM avx512 8822cglobal upShift_16, 4,7,4 8823 mov r4d, r4m 8824 mov r5d, r5m 8825 movd xm0, r6m ; m0 = shift 8826 vbroadcasti32x4 m3, [pw_pixel_max] 8827 FIX_STRIDES r1d, r3d 8828 dec r5d 8829.loopH: 8830 xor r6d, r6d 8831.loopW: 8832 movu m1, [r0 + r6 * SIZEOF_PIXEL] 8833 psllw m1, xm0 8834 pand m1, m3 8835 movu [r2 + r6 * SIZEOF_PIXEL], m1 8836 8837 add r6, mmsize / SIZEOF_PIXEL 8838 cmp r6d, r4d 8839 jl .loopW 8840 8841 ; move to next row 8842 add r0, r1 8843 add r2, r3 8844 dec r5d 8845 jnz .loopH 8846 8847 ; processing last row of every frame [To handle width which not a multiple of 32] 8848 8849.loop32: 8850 movu m1, [r0 + (r4 - mmsize/2) * 2] 8851 psllw m1, xm0 8852 pand m1, m3 8853 movu [r2 + (r4 - mmsize/2) * 2], m1 8854 8855 sub r4d, mmsize/2 8856 jz .end 8857 cmp r4d, mmsize/2 8858 jge .loop32 8859 8860 ; process partial pixels 8861 movu m1, [r0] 8862 psllw m1, xm0 8863 pand m1, m3 8864 movu [r2], m1 8865 8866.end: 8867 RET 8868;--------------------------------------------------------------------------------------------------------------------- 8869;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) 8870;--------------------------------------------------------------------------------------------------------------------- 8871INIT_XMM sse4 8872cglobal psyCost_pp_4x4, 4, 5, 8 8873 8874%if HIGH_BIT_DEPTH 8875 FIX_STRIDES r1, r3 8876 lea r4, [3 * r1] 8877 movddup m0, [r0] 8878 movddup m1, [r0 + r1] 8879 movddup m2, [r0 + r1 * 2] 8880 movddup m3, [r0 + r4] 8881 mova m4, [hmul_8w] 8882 pmaddwd m0, m4 8883 pmaddwd m1, m4 8884 pmaddwd m2, m4 8885 pmaddwd m3, m4 8886 8887 paddd m5, m0, m1 8888 paddd m5, m2 8889 paddd m5, m3 8890 psrldq m4, m5, 4 8891 paddd m5, m4 8892 psrld m5, 2 8893 8894 SUMSUB_BA d, 0, 1, 4 8895 SUMSUB_BA d, 2, 3, 4 8896 SUMSUB_BA d, 0, 2, 4 8897 SUMSUB_BA d, 1, 3, 4 8898 %define ORDER unord 8899 TRANS q, ORDER, 0, 2, 4, 6 8900 TRANS q, ORDER, 1, 3, 4, 6 8901 ABSD2 m0, m2, m0, m2, m4, m6 8902 pmaxsd m0, m2 8903 ABSD2 m1, m3, m1, m3, m4, m6 8904 pmaxsd m1, m3 8905 paddd m0, m1 8906 movhlps m1, m0 8907 paddd m0, m1 8908 psrldq m1, m0, 4 8909 paddd m0, m1 8910 8911 psubd m7, m0, m5 8912 8913 lea r4, [3 * r3] 8914 movddup m0, [r2] 8915 movddup m1, [r2 + r3] 8916 movddup m2, [r2 + r3 * 2] 8917 movddup m3, [r2 + r4] 8918 mova m4, [hmul_8w] 8919 pmaddwd m0, m4 8920 pmaddwd m1, m4 8921 pmaddwd m2, m4 8922 pmaddwd m3, m4 8923 8924 paddd m5, m0, m1 8925 paddd m5, m2 8926 paddd m5, m3 8927 psrldq m4, m5, 4 8928 paddd m5, m4 8929 psrld m5, 2 8930 8931 SUMSUB_BA d, 0, 1, 4 8932 SUMSUB_BA d, 2, 3, 4 8933 SUMSUB_BA d, 0, 2, 4 8934 SUMSUB_BA d, 1, 3, 4 8935 %define ORDER unord 8936 TRANS q, ORDER, 0, 
2, 4, 6 8937 TRANS q, ORDER, 1, 3, 4, 6 8938 ABSD2 m0, m2, m0, m2, m4, m6 8939 pmaxsd m0, m2 8940 ABSD2 m1, m3, m1, m3, m4, m6 8941 pmaxsd m1, m3 8942 paddd m0, m1 8943 movhlps m1, m0 8944 paddd m0, m1 8945 psrldq m1, m0, 4 8946 paddd m0, m1 8947 8948 psubd m0, m5 8949 8950 psubd m7, m0 8951 pabsd m0, m7 8952 movd eax, m0 8953 8954%else ; !HIGH_BIT_DEPTH 8955 lea r4, [3 * r1] 8956 movd m0, [r0] 8957 movd m1, [r0 + r1] 8958 movd m2, [r0 + r1 * 2] 8959 movd m3, [r0 + r4] 8960 shufps m0, m1, 0 8961 shufps m2, m3, 0 8962 mova m4, [hmul_4p] 8963 pmaddubsw m0, m4 8964 pmaddubsw m2, m4 8965 8966 paddw m5, m0, m2 8967 movhlps m4, m5 8968 paddw m5, m4 8969 pmaddwd m5, [pw_1] 8970 psrld m5, 2 8971 8972 HADAMARD 0, sumsub, 0, 2, 1, 3 8973 HADAMARD 4, sumsub, 0, 2, 1, 3 8974 HADAMARD 1, amax, 0, 2, 1, 3 8975 HADDW m0, m2 8976 8977 psubd m6, m0, m5 8978 8979 lea r4, [3 * r3] 8980 movd m0, [r2] 8981 movd m1, [r2 + r3] 8982 movd m2, [r2 + r3 * 2] 8983 movd m3, [r2 + r4] 8984 shufps m0, m1, 0 8985 shufps m2, m3, 0 8986 mova m4, [hmul_4p] 8987 pmaddubsw m0, m4 8988 pmaddubsw m2, m4 8989 8990 paddw m5, m0, m2 8991 movhlps m4, m5 8992 paddw m5, m4 8993 pmaddwd m5, [pw_1] 8994 psrld m5, 2 8995 8996 HADAMARD 0, sumsub, 0, 2, 1, 3 8997 HADAMARD 4, sumsub, 0, 2, 1, 3 8998 HADAMARD 1, amax, 0, 2, 1, 3 8999 HADDW m0, m2 9000 9001 psubd m0, m5 9002 9003 psubd m6, m0 9004 pabsd m0, m6 9005 movd eax, m0 9006%endif ; HIGH_BIT_DEPTH 9007 RET 9008 9009%if ARCH_X86_64 9010INIT_XMM sse4 9011cglobal psyCost_pp_8x8, 4, 6, 13 9012 9013%if HIGH_BIT_DEPTH 9014 FIX_STRIDES r1, r3 9015 lea r4, [3 * r1] 9016 pxor m10, m10 9017 movu m0, [r0] 9018 movu m1, [r0 + r1] 9019 movu m2, [r0 + r1 * 2] 9020 movu m3, [r0 + r4] 9021 lea r5, [r0 + r1 * 4] 9022 movu m4, [r5] 9023 movu m5, [r5 + r1] 9024 movu m6, [r5 + r1 * 2] 9025 movu m7, [r5 + r4] 9026 9027 paddw m8, m0, m1 9028 paddw m8, m2 9029 paddw m8, m3 9030 paddw m8, m4 9031 paddw m8, m5 9032 paddw m8, m6 9033 paddw m8, m7 9034 pmaddwd m8, [pw_1] 9035 movhlps m9, m8 9036 paddd m8, m9 9037 psrldq m9, m8, 4 9038 paddd m8, m9 9039 psrld m8, 2 9040 9041 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9042 9043 paddd m0, m1 9044 paddd m0, m2 9045 paddd m0, m3 9046 HADDUW m0, m1 9047 paddd m0, [pd_1] 9048 psrld m0, 1 9049 psubd m10, m0, m8 9050 9051 lea r4, [3 * r3] 9052 movu m0, [r2] 9053 movu m1, [r2 + r3] 9054 movu m2, [r2 + r3 * 2] 9055 movu m3, [r2 + r4] 9056 lea r5, [r2 + r3 * 4] 9057 movu m4, [r5] 9058 movu m5, [r5 + r3] 9059 movu m6, [r5 + r3 * 2] 9060 movu m7, [r5 + r4] 9061 9062 paddw m8, m0, m1 9063 paddw m8, m2 9064 paddw m8, m3 9065 paddw m8, m4 9066 paddw m8, m5 9067 paddw m8, m6 9068 paddw m8, m7 9069 pmaddwd m8, [pw_1] 9070 movhlps m9, m8 9071 paddd m8, m9 9072 psrldq m9, m8, 4 9073 paddd m8, m9 9074 psrld m8, 2 9075 9076 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9077 9078 paddd m0, m1 9079 paddd m0, m2 9080 paddd m0, m3 9081 HADDUW m0, m1 9082 paddd m0, [pd_1] 9083 psrld m0, 1 9084 psubd m0, m8 9085 psubd m10, m0 9086 pabsd m0, m10 9087 movd eax, m0 9088%else ; !HIGH_BIT_DEPTH 9089 lea r4, [3 * r1] 9090 mova m8, [hmul_8p] 9091 9092 movddup m0, [r0] 9093 movddup m1, [r0 + r1] 9094 movddup m2, [r0 + r1 * 2] 9095 movddup m3, [r0 + r4] 9096 lea r5, [r0 + r1 * 4] 9097 movddup m4, [r5] 9098 movddup m5, [r5 + r1] 9099 movddup m6, [r5 + r1 * 2] 9100 movddup m7, [r5 + r4] 9101 9102 pmaddubsw m0, m8 9103 pmaddubsw m1, m8 9104 pmaddubsw m2, m8 9105 pmaddubsw m3, m8 9106 pmaddubsw m4, m8 9107 pmaddubsw m5, m8 9108 pmaddubsw m6, m8 9109 pmaddubsw m7, m8 9110 9111 paddw m11, m0, m1 9112 
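    ; psyCost_pp returns the absolute difference between a "psy energy" computed
    ; for the source block and the same quantity computed for the recon block.
    ; For the 8x8 kernels the energy is, roughly,
    ;
    ;   energy(p) = sa8d_8x8(p) - (sum_8x8(p) >> 2)
    ;   psy_cost  = abs(energy(source) - energy(recon))
    ;
    ; in the surrounding code m11 carries the pixel-sum (>> 2) term, the
    ; HADAMARD8_2D_HMUL section produces the rounded SA8D term, and m12 holds
    ; energy(source); the 4x4 kernel above uses the 4x4 SATD in place of SA8D.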
paddw m11, m2 9113 paddw m11, m3 9114 paddw m11, m4 9115 paddw m11, m5 9116 paddw m11, m6 9117 paddw m11, m7 9118 9119 pmaddwd m11, [pw_1] 9120 psrldq m10, m11, 4 9121 paddd m11, m10 9122 psrld m11, 2 9123 9124 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 9125 9126 paddw m0, m1 9127 paddw m0, m2 9128 paddw m0, m3 9129 HADDW m0, m1 9130 9131 paddd m0, [pd_1] 9132 psrld m0, 1 9133 psubd m12, m0, m11 9134 9135 lea r4, [3 * r3] 9136 9137 movddup m0, [r2] 9138 movddup m1, [r2 + r3] 9139 movddup m2, [r2 + r3 * 2] 9140 movddup m3, [r2 + r4] 9141 lea r5, [r2 + r3 * 4] 9142 movddup m4, [r5] 9143 movddup m5, [r5 + r3] 9144 movddup m6, [r5 + r3 * 2] 9145 movddup m7, [r5 + r4] 9146 9147 pmaddubsw m0, m8 9148 pmaddubsw m1, m8 9149 pmaddubsw m2, m8 9150 pmaddubsw m3, m8 9151 pmaddubsw m4, m8 9152 pmaddubsw m5, m8 9153 pmaddubsw m6, m8 9154 pmaddubsw m7, m8 9155 9156 paddw m11, m0, m1 9157 paddw m11, m2 9158 paddw m11, m3 9159 paddw m11, m4 9160 paddw m11, m5 9161 paddw m11, m6 9162 paddw m11, m7 9163 9164 pmaddwd m11, [pw_1] 9165 psrldq m10, m11, 4 9166 paddd m11, m10 9167 psrld m11, 2 9168 9169 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 9170 9171 paddw m0, m1 9172 paddw m0, m2 9173 paddw m0, m3 9174 HADDW m0, m1 9175 9176 paddd m0, [pd_1] 9177 psrld m0, 1 9178 psubd m0, m11 9179 psubd m12, m0 9180 pabsd m0, m12 9181 movd eax, m0 9182%endif ; HIGH_BIT_DEPTH 9183 RET 9184%endif 9185 9186%if ARCH_X86_64 9187%if HIGH_BIT_DEPTH 9188INIT_XMM sse4 9189cglobal psyCost_pp_16x16, 4, 9, 14 9190 9191 FIX_STRIDES r1, r3 9192 lea r4, [3 * r1] 9193 lea r8, [3 * r3] 9194 mova m12, [pw_1] 9195 mova m13, [pd_1] 9196 pxor m11, m11 9197 mov r7d, 2 9198.loopH: 9199 mov r6d, 2 9200.loopW: 9201 pxor m10, m10 9202 movu m0, [r0] 9203 movu m1, [r0 + r1] 9204 movu m2, [r0 + r1 * 2] 9205 movu m3, [r0 + r4] 9206 lea r5, [r0 + r1 * 4] 9207 movu m4, [r5] 9208 movu m5, [r5 + r1] 9209 movu m6, [r5 + r1 * 2] 9210 movu m7, [r5 + r4] 9211 9212 paddw m8, m0, m1 9213 paddw m8, m2 9214 paddw m8, m3 9215 paddw m8, m4 9216 paddw m8, m5 9217 paddw m8, m6 9218 paddw m8, m7 9219 pmaddwd m8, m12 9220 movhlps m9, m8 9221 paddd m8, m9 9222 psrldq m9, m8, 4 9223 paddd m8, m9 9224 psrld m8, 2 9225 9226 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9227 9228 paddd m0, m1 9229 paddd m0, m2 9230 paddd m0, m3 9231 HADDUW m0, m1 9232 paddd m0, m13 9233 psrld m0, 1 9234 psubd m10, m0, m8 9235 9236 movu m0, [r2] 9237 movu m1, [r2 + r3] 9238 movu m2, [r2 + r3 * 2] 9239 movu m3, [r2 + r8] 9240 lea r5, [r2 + r3 * 4] 9241 movu m4, [r5] 9242 movu m5, [r5 + r3] 9243 movu m6, [r5 + r3 * 2] 9244 movu m7, [r5 + r8] 9245 9246 paddw m8, m0, m1 9247 paddw m8, m2 9248 paddw m8, m3 9249 paddw m8, m4 9250 paddw m8, m5 9251 paddw m8, m6 9252 paddw m8, m7 9253 pmaddwd m8, m12 9254 movhlps m9, m8 9255 paddd m8, m9 9256 psrldq m9, m8, 4 9257 paddd m8, m9 9258 psrld m8, 2 9259 9260 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9261 9262 paddd m0, m1 9263 paddd m0, m2 9264 paddd m0, m3 9265 HADDUW m0, m1 9266 paddd m0, m13 9267 psrld m0, 1 9268 psubd m0, m8 9269 psubd m10, m0 9270 pabsd m0, m10 9271 paddd m11, m0 9272 add r0, 16 9273 add r2, 16 9274 dec r6d 9275 jnz .loopW 9276 lea r0, [r0 + r1 * 8 - 32] 9277 lea r2, [r2 + r3 * 8 - 32] 9278 dec r7d 9279 jnz .loopH 9280 movd eax, m11 9281 RET 9282%else ; !HIGH_BIT_DEPTH 9283INIT_XMM sse4 9284cglobal psyCost_pp_16x16, 4, 9, 15 9285 lea r4, [3 * r1] 9286 lea r8, [3 * r3] 9287 mova m8, [hmul_8p] 9288 mova m10, [pw_1] 9289 mova m14, [pd_1] 9290 pxor m13, m13 9291 mov r7d, 2 9292.loopH: 9293 mov r6d, 2 9294.loopW: 9295 pxor 
m12, m12 9296 movddup m0, [r0] 9297 movddup m1, [r0 + r1] 9298 movddup m2, [r0 + r1 * 2] 9299 movddup m3, [r0 + r4] 9300 lea r5, [r0 + r1 * 4] 9301 movddup m4, [r5] 9302 movddup m5, [r5 + r1] 9303 movddup m6, [r5 + r1 * 2] 9304 movddup m7, [r5 + r4] 9305 9306 pmaddubsw m0, m8 9307 pmaddubsw m1, m8 9308 pmaddubsw m2, m8 9309 pmaddubsw m3, m8 9310 pmaddubsw m4, m8 9311 pmaddubsw m5, m8 9312 pmaddubsw m6, m8 9313 pmaddubsw m7, m8 9314 9315 paddw m11, m0, m1 9316 paddw m11, m2 9317 paddw m11, m3 9318 paddw m11, m4 9319 paddw m11, m5 9320 paddw m11, m6 9321 paddw m11, m7 9322 9323 pmaddwd m11, m10 9324 psrldq m9, m11, 4 9325 paddd m11, m9 9326 psrld m11, 2 9327 9328 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 9329 9330 paddw m0, m1 9331 paddw m0, m2 9332 paddw m0, m3 9333 HADDW m0, m1 9334 9335 paddd m0, m14 9336 psrld m0, 1 9337 psubd m12, m0, m11 9338 9339 movddup m0, [r2] 9340 movddup m1, [r2 + r3] 9341 movddup m2, [r2 + r3 * 2] 9342 movddup m3, [r2 + r8] 9343 lea r5, [r2 + r3 * 4] 9344 movddup m4, [r5] 9345 movddup m5, [r5 + r3] 9346 movddup m6, [r5 + r3 * 2] 9347 movddup m7, [r5 + r8] 9348 9349 pmaddubsw m0, m8 9350 pmaddubsw m1, m8 9351 pmaddubsw m2, m8 9352 pmaddubsw m3, m8 9353 pmaddubsw m4, m8 9354 pmaddubsw m5, m8 9355 pmaddubsw m6, m8 9356 pmaddubsw m7, m8 9357 9358 paddw m11, m0, m1 9359 paddw m11, m2 9360 paddw m11, m3 9361 paddw m11, m4 9362 paddw m11, m5 9363 paddw m11, m6 9364 paddw m11, m7 9365 9366 pmaddwd m11, m10 9367 psrldq m9, m11, 4 9368 paddd m11, m9 9369 psrld m11, 2 9370 9371 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 9372 9373 paddw m0, m1 9374 paddw m0, m2 9375 paddw m0, m3 9376 HADDW m0, m1 9377 9378 paddd m0, m14 9379 psrld m0, 1 9380 psubd m0, m11 9381 psubd m12, m0 9382 pabsd m0, m12 9383 paddd m13, m0 9384 add r0, 8 9385 add r2, 8 9386 dec r6d 9387 jnz .loopW 9388 lea r0, [r0 + r1 * 8 - 16] 9389 lea r2, [r2 + r3 * 8 - 16] 9390 dec r7d 9391 jnz .loopH 9392 movd eax, m13 9393 RET 9394%endif ; HIGH_BIT_DEPTH 9395%endif 9396 9397%if ARCH_X86_64 9398%if HIGH_BIT_DEPTH 9399INIT_XMM sse4 9400cglobal psyCost_pp_32x32, 4, 9, 14 9401 9402 FIX_STRIDES r1, r3 9403 lea r4, [3 * r1] 9404 lea r8, [3 * r3] 9405 mova m12, [pw_1] 9406 mova m13, [pd_1] 9407 pxor m11, m11 9408 mov r7d, 4 9409.loopH: 9410 mov r6d, 4 9411.loopW: 9412 pxor m10, m10 9413 movu m0, [r0] 9414 movu m1, [r0 + r1] 9415 movu m2, [r0 + r1 * 2] 9416 movu m3, [r0 + r4] 9417 lea r5, [r0 + r1 * 4] 9418 movu m4, [r5] 9419 movu m5, [r5 + r1] 9420 movu m6, [r5 + r1 * 2] 9421 movu m7, [r5 + r4] 9422 9423 paddw m8, m0, m1 9424 paddw m8, m2 9425 paddw m8, m3 9426 paddw m8, m4 9427 paddw m8, m5 9428 paddw m8, m6 9429 paddw m8, m7 9430 pmaddwd m8, m12 9431 movhlps m9, m8 9432 paddd m8, m9 9433 psrldq m9, m8, 4 9434 paddd m8, m9 9435 psrld m8, 2 9436 9437 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9438 9439 paddd m0, m1 9440 paddd m0, m2 9441 paddd m0, m3 9442 HADDUW m0, m1 9443 paddd m0, m13 9444 psrld m0, 1 9445 psubd m10, m0, m8 9446 9447 movu m0, [r2] 9448 movu m1, [r2 + r3] 9449 movu m2, [r2 + r3 * 2] 9450 movu m3, [r2 + r8] 9451 lea r5, [r2 + r3 * 4] 9452 movu m4, [r5] 9453 movu m5, [r5 + r3] 9454 movu m6, [r5 + r3 * 2] 9455 movu m7, [r5 + r8] 9456 9457 paddw m8, m0, m1 9458 paddw m8, m2 9459 paddw m8, m3 9460 paddw m8, m4 9461 paddw m8, m5 9462 paddw m8, m6 9463 paddw m8, m7 9464 pmaddwd m8, m12 9465 movhlps m9, m8 9466 paddd m8, m9 9467 psrldq m9, m8, 4 9468 paddd m8, m9 9469 psrld m8, 2 9470 9471 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9472 9473 paddd m0, m1 9474 paddd m0, m2 9475 paddd m0, m3 9476 
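    ; (The psyCost_pp_16x16/32x32/64x64 kernels wrap this same 8x8 energy
    ; computation in a .loopW/.loopH walk over the 8x8 sub-blocks: r6d/r7d count
    ; the blocks per row and the block rows, the pointers advance 8 pixels per
    ; .loopW step and drop 8 rows while rewinding the width per .loopH step, and
    ; the per-block absolute energy differences accumulate in m11/m13.)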
HADDUW m0, m1 9477 paddd m0, m13 9478 psrld m0, 1 9479 psubd m0, m8 9480 psubd m10, m0 9481 pabsd m0, m10 9482 paddd m11, m0 9483 add r0, 16 9484 add r2, 16 9485 dec r6d 9486 jnz .loopW 9487 lea r0, [r0 + r1 * 8 - 64] 9488 lea r2, [r2 + r3 * 8 - 64] 9489 dec r7d 9490 jnz .loopH 9491 movd eax, m11 9492 RET 9493 9494%else ; !HIGH_BIT_DEPTH 9495INIT_XMM sse4 9496cglobal psyCost_pp_32x32, 4, 9, 15 9497 9498 lea r4, [3 * r1] 9499 lea r8, [3 * r3] 9500 mova m8, [hmul_8p] 9501 mova m10, [pw_1] 9502 mova m14, [pd_1] 9503 pxor m13, m13 9504 mov r7d, 4 9505.loopH: 9506 mov r6d, 4 9507.loopW: 9508 pxor m12, m12 9509 movddup m0, [r0] 9510 movddup m1, [r0 + r1] 9511 movddup m2, [r0 + r1 * 2] 9512 movddup m3, [r0 + r4] 9513 lea r5, [r0 + r1 * 4] 9514 movddup m4, [r5] 9515 movddup m5, [r5 + r1] 9516 movddup m6, [r5 + r1 * 2] 9517 movddup m7, [r5 + r4] 9518 9519 pmaddubsw m0, m8 9520 pmaddubsw m1, m8 9521 pmaddubsw m2, m8 9522 pmaddubsw m3, m8 9523 pmaddubsw m4, m8 9524 pmaddubsw m5, m8 9525 pmaddubsw m6, m8 9526 pmaddubsw m7, m8 9527 9528 paddw m11, m0, m1 9529 paddw m11, m2 9530 paddw m11, m3 9531 paddw m11, m4 9532 paddw m11, m5 9533 paddw m11, m6 9534 paddw m11, m7 9535 9536 pmaddwd m11, m10 9537 psrldq m9, m11, 4 9538 paddd m11, m9 9539 psrld m11, 2 9540 9541 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 9542 9543 paddw m0, m1 9544 paddw m0, m2 9545 paddw m0, m3 9546 HADDW m0, m1 9547 9548 paddd m0, m14 9549 psrld m0, 1 9550 psubd m12, m0, m11 9551 9552 movddup m0, [r2] 9553 movddup m1, [r2 + r3] 9554 movddup m2, [r2 + r3 * 2] 9555 movddup m3, [r2 + r8] 9556 lea r5, [r2 + r3 * 4] 9557 movddup m4, [r5] 9558 movddup m5, [r5 + r3] 9559 movddup m6, [r5 + r3 * 2] 9560 movddup m7, [r5 + r8] 9561 9562 pmaddubsw m0, m8 9563 pmaddubsw m1, m8 9564 pmaddubsw m2, m8 9565 pmaddubsw m3, m8 9566 pmaddubsw m4, m8 9567 pmaddubsw m5, m8 9568 pmaddubsw m6, m8 9569 pmaddubsw m7, m8 9570 9571 paddw m11, m0, m1 9572 paddw m11, m2 9573 paddw m11, m3 9574 paddw m11, m4 9575 paddw m11, m5 9576 paddw m11, m6 9577 paddw m11, m7 9578 9579 pmaddwd m11, m10 9580 psrldq m9, m11, 4 9581 paddd m11, m9 9582 psrld m11, 2 9583 9584 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 9585 9586 paddw m0, m1 9587 paddw m0, m2 9588 paddw m0, m3 9589 HADDW m0, m1 9590 9591 paddd m0, m14 9592 psrld m0, 1 9593 psubd m0, m11 9594 psubd m12, m0 9595 pabsd m0, m12 9596 paddd m13, m0 9597 add r0, 8 9598 add r2, 8 9599 dec r6d 9600 jnz .loopW 9601 lea r0, [r0 + r1 * 8 - 32] 9602 lea r2, [r2 + r3 * 8 - 32] 9603 dec r7d 9604 jnz .loopH 9605 movd eax, m13 9606 RET 9607%endif ; HIGH_BIT_DEPTH 9608%endif 9609 9610%if ARCH_X86_64 9611%if HIGH_BIT_DEPTH 9612INIT_XMM sse4 9613cglobal psyCost_pp_64x64, 4, 9, 14 9614 9615 FIX_STRIDES r1, r3 9616 lea r4, [3 * r1] 9617 lea r8, [3 * r3] 9618 mova m12, [pw_1] 9619 mova m13, [pd_1] 9620 pxor m11, m11 9621 mov r7d, 8 9622.loopH: 9623 mov r6d, 8 9624.loopW: 9625 pxor m10, m10 9626 movu m0, [r0] 9627 movu m1, [r0 + r1] 9628 movu m2, [r0 + r1 * 2] 9629 movu m3, [r0 + r4] 9630 lea r5, [r0 + r1 * 4] 9631 movu m4, [r5] 9632 movu m5, [r5 + r1] 9633 movu m6, [r5 + r1 * 2] 9634 movu m7, [r5 + r4] 9635 9636 paddw m8, m0, m1 9637 paddw m8, m2 9638 paddw m8, m3 9639 paddw m8, m4 9640 paddw m8, m5 9641 paddw m8, m6 9642 paddw m8, m7 9643 pmaddwd m8, m12 9644 movhlps m9, m8 9645 paddd m8, m9 9646 psrldq m9, m8, 4 9647 paddd m8, m9 9648 psrld m8, 2 9649 9650 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9651 9652 paddd m0, m1 9653 paddd m0, m2 9654 paddd m0, m3 9655 HADDUW m0, m1 9656 paddd m0, m13 9657 psrld m0, 1 9658 psubd m10, 
m0, m8 9659 9660 movu m0, [r2] 9661 movu m1, [r2 + r3] 9662 movu m2, [r2 + r3 * 2] 9663 movu m3, [r2 + r8] 9664 lea r5, [r2 + r3 * 4] 9665 movu m4, [r5] 9666 movu m5, [r5 + r3] 9667 movu m6, [r5 + r3 * 2] 9668 movu m7, [r5 + r8] 9669 9670 paddw m8, m0, m1 9671 paddw m8, m2 9672 paddw m8, m3 9673 paddw m8, m4 9674 paddw m8, m5 9675 paddw m8, m6 9676 paddw m8, m7 9677 pmaddwd m8, m12 9678 movhlps m9, m8 9679 paddd m8, m9 9680 psrldq m9, m8, 4 9681 paddd m8, m9 9682 psrld m8, 2 9683 9684 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax 9685 9686 paddd m0, m1 9687 paddd m0, m2 9688 paddd m0, m3 9689 HADDUW m0, m1 9690 paddd m0, m13 9691 psrld m0, 1 9692 psubd m0, m8 9693 psubd m10, m0 9694 pabsd m0, m10 9695 paddd m11, m0 9696 add r0, 16 9697 add r2, 16 9698 dec r6d 9699 jnz .loopW 9700 lea r0, [r0 + r1 * 8 - 128] 9701 lea r2, [r2 + r3 * 8 - 128] 9702 dec r7d 9703 jnz .loopH 9704 movd eax, m11 9705 RET 9706 9707%else ; !HIGH_BIT_DEPTH 9708INIT_XMM sse4 9709cglobal psyCost_pp_64x64, 4, 9, 15 9710 9711 lea r4, [3 * r1] 9712 lea r8, [3 * r3] 9713 mova m8, [hmul_8p] 9714 mova m10, [pw_1] 9715 mova m14, [pd_1] 9716 pxor m13, m13 9717 mov r7d, 8 9718.loopH: 9719 mov r6d, 8 9720.loopW: 9721 pxor m12, m12 9722 movddup m0, [r0] 9723 movddup m1, [r0 + r1] 9724 movddup m2, [r0 + r1 * 2] 9725 movddup m3, [r0 + r4] 9726 lea r5, [r0 + r1 * 4] 9727 movddup m4, [r5] 9728 movddup m5, [r5 + r1] 9729 movddup m6, [r5 + r1 * 2] 9730 movddup m7, [r5 + r4] 9731 9732 pmaddubsw m0, m8 9733 pmaddubsw m1, m8 9734 pmaddubsw m2, m8 9735 pmaddubsw m3, m8 9736 pmaddubsw m4, m8 9737 pmaddubsw m5, m8 9738 pmaddubsw m6, m8 9739 pmaddubsw m7, m8 9740 9741 paddw m11, m0, m1 9742 paddw m11, m2 9743 paddw m11, m3 9744 paddw m11, m4 9745 paddw m11, m5 9746 paddw m11, m6 9747 paddw m11, m7 9748 9749 pmaddwd m11, m10 9750 psrldq m9, m11, 4 9751 paddd m11, m9 9752 psrld m11, 2 9753 9754 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 9755 9756 paddw m0, m1 9757 paddw m0, m2 9758 paddw m0, m3 9759 HADDW m0, m1 9760 9761 paddd m0, m14 9762 psrld m0, 1 9763 psubd m12, m0, m11 9764 9765 movddup m0, [r2] 9766 movddup m1, [r2 + r3] 9767 movddup m2, [r2 + r3 * 2] 9768 movddup m3, [r2 + r8] 9769 lea r5, [r2 + r3 * 4] 9770 movddup m4, [r5] 9771 movddup m5, [r5 + r3] 9772 movddup m6, [r5 + r3 * 2] 9773 movddup m7, [r5 + r8] 9774 9775 pmaddubsw m0, m8 9776 pmaddubsw m1, m8 9777 pmaddubsw m2, m8 9778 pmaddubsw m3, m8 9779 pmaddubsw m4, m8 9780 pmaddubsw m5, m8 9781 pmaddubsw m6, m8 9782 pmaddubsw m7, m8 9783 9784 paddw m11, m0, m1 9785 paddw m11, m2 9786 paddw m11, m3 9787 paddw m11, m4 9788 paddw m11, m5 9789 paddw m11, m6 9790 paddw m11, m7 9791 9792 pmaddwd m11, m10 9793 psrldq m9, m11, 4 9794 paddd m11, m9 9795 psrld m11, 2 9796 9797 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 9798 9799 paddw m0, m1 9800 paddw m0, m2 9801 paddw m0, m3 9802 HADDW m0, m1 9803 9804 paddd m0, m14 9805 psrld m0, 1 9806 psubd m0, m11 9807 psubd m12, m0 9808 pabsd m0, m12 9809 paddd m13, m0 9810 add r0, 8 9811 add r2, 8 9812 dec r6d 9813 jnz .loopW 9814 lea r0, [r0 + r1 * 8 - 64] 9815 lea r2, [r2 + r3 * 8 - 64] 9816 dec r7d 9817 jnz .loopH 9818 movd eax, m13 9819 RET 9820%endif ; HIGH_BIT_DEPTH 9821%endif 9822 9823INIT_YMM avx2 9824%if HIGH_BIT_DEPTH 9825cglobal psyCost_pp_4x4, 4, 5, 6 9826 add r1d, r1d 9827 add r3d, r3d 9828 lea r4, [r1 * 3] 9829 movddup xm0, [r0] 9830 movddup xm1, [r0 + r1] 9831 movddup xm2, [r0 + r1 * 2] 9832 movddup xm3, [r0 + r4] 9833 9834 lea r4, [r3 * 3] 9835 movddup xm4, [r2] 9836 movddup xm5, [r2 + r3] 9837 vinserti128 m0, m0, xm4, 1 9838 
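    ; (AVX2 layout: the source rows stay in the low 128-bit lane while the recon
    ; rows are vinserti128'd into the high lane, so both energies are computed in
    ; one pass; the closing vextracti128/psubd/pabsd then yields
    ; |energy(source) - energy(recon)|.)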
vinserti128 m1, m1, xm5, 1 9839 movddup xm4, [r2 + r3 * 2] 9840 movddup xm5, [r2 + r4] 9841 vinserti128 m2, m2, xm4, 1 9842 vinserti128 m3, m3, xm5, 1 9843 9844 mova m4, [hmul_8w] 9845 pmaddwd m0, m4 9846 pmaddwd m1, m4 9847 pmaddwd m2, m4 9848 pmaddwd m3, m4 9849 paddd m5, m0, m1 9850 paddd m4, m2, m3 9851 paddd m5, m4 9852 psrldq m4, m5, 4 9853 paddd m5, m4 9854 psrld m5, 2 9855 9856 mova m4, m0 9857 paddd m0, m1 9858 psubd m1, m4 9859 mova m4, m2 9860 paddd m2, m3 9861 psubd m3, m4 9862 mova m4, m0 9863 paddd m0, m2 9864 psubd m2, m4 9865 mova m4, m1 9866 paddd m1, m3 9867 psubd m3, m4 9868 movaps m4, m0 9869 vshufps m4, m4, m2, 11011101b 9870 vshufps m0, m0, m2, 10001000b 9871 movaps m2, m1 9872 vshufps m2, m2, m3, 11011101b 9873 vshufps m1, m1, m3, 10001000b 9874 pabsd m0, m0 9875 pabsd m4, m4 9876 pmaxsd m0, m4 9877 pabsd m1, m1 9878 pabsd m2, m2 9879 pmaxsd m1, m2 9880 paddd m0, m1 9881 9882 vpermq m1, m0, 11110101b 9883 paddd m0, m1 9884 psrldq m1, m0, 4 9885 paddd m0, m1 9886 psubd m0, m5 9887 9888 vextracti128 xm1, m0, 1 9889 psubd xm1, xm0 9890 pabsd xm1, xm1 9891 movd eax, xm1 9892 RET 9893%else ; !HIGH_BIT_DEPTH 9894cglobal psyCost_pp_4x4, 4, 5, 6 9895 lea r4, [3 * r1] 9896 movd xm0, [r0] 9897 movd xm1, [r0 + r1] 9898 movd xm2, [r0 + r1 * 2] 9899 movd xm3, [r0 + r4] 9900 vshufps xm0, xm1, 0 9901 vshufps xm2, xm3, 0 9902 9903 lea r4, [3 * r3] 9904 movd xm1, [r2] 9905 movd xm3, [r2 + r3] 9906 movd xm4, [r2 + r3 * 2] 9907 movd xm5, [r2 + r4] 9908 vshufps xm1, xm3, 0 9909 vshufps xm4, xm5, 0 9910 9911 vinserti128 m0, m0, xm1, 1 9912 vinserti128 m2, m2, xm4, 1 9913 9914 mova m4, [hmul_4p] 9915 pmaddubsw m0, m4 9916 pmaddubsw m2, m4 9917 9918 paddw m5, m0, m2 9919 mova m1, m5 9920 psrldq m4, m5, 8 9921 paddw m5, m4 9922 pmaddwd m5, [pw_1] 9923 psrld m5, 2 9924 9925 vpsubw m2, m2, m0 9926 vpunpckhqdq m0, m1, m2 9927 vpunpcklqdq m1, m1, m2 9928 vpaddw m2, m1, m0 9929 vpsubw m0, m0, m1 9930 vpblendw m1, m2, m0, 10101010b 9931 vpslld m0, m0, 10h 9932 vpsrld m2, m2, 10h 9933 vpor m0, m0, m2 9934 vpabsw m1, m1 9935 vpabsw m0, m0 9936 vpmaxsw m1, m1, m0 9937 vpmaddwd m1, m1, [pw_1] 9938 psrldq m2, m1, 8 9939 paddd m1, m2 9940 psrldq m3, m1, 4 9941 paddd m1, m3 9942 psubd m1, m5 9943 vextracti128 xm2, m1, 1 9944 psubd m1, m2 9945 pabsd m1, m1 9946 movd eax, xm1 9947 RET 9948%endif 9949 9950%macro PSY_PP_8x8 0 9951 movddup m0, [r0 + r1 * 0] 9952 movddup m1, [r0 + r1 * 1] 9953 movddup m2, [r0 + r1 * 2] 9954 movddup m3, [r0 + r4 * 1] 9955 9956 lea r5, [r0 + r1 * 4] 9957 9958 movddup m4, [r2 + r3 * 0] 9959 movddup m5, [r2 + r3 * 1] 9960 movddup m6, [r2 + r3 * 2] 9961 movddup m7, [r2 + r7 * 1] 9962 9963 lea r6, [r2 + r3 * 4] 9964 9965 vinserti128 m0, m0, xm4, 1 9966 vinserti128 m1, m1, xm5, 1 9967 vinserti128 m2, m2, xm6, 1 9968 vinserti128 m3, m3, xm7, 1 9969 9970 movddup m4, [r5 + r1 * 0] 9971 movddup m5, [r5 + r1 * 1] 9972 movddup m6, [r5 + r1 * 2] 9973 movddup m7, [r5 + r4 * 1] 9974 9975 movddup m9, [r6 + r3 * 0] 9976 movddup m10, [r6 + r3 * 1] 9977 movddup m11, [r6 + r3 * 2] 9978 movddup m12, [r6 + r7 * 1] 9979 9980 vinserti128 m4, m4, xm9, 1 9981 vinserti128 m5, m5, xm10, 1 9982 vinserti128 m6, m6, xm11, 1 9983 vinserti128 m7, m7, xm12, 1 9984 9985 pmaddubsw m0, m8 9986 pmaddubsw m1, m8 9987 pmaddubsw m2, m8 9988 pmaddubsw m3, m8 9989 pmaddubsw m4, m8 9990 pmaddubsw m5, m8 9991 pmaddubsw m6, m8 9992 pmaddubsw m7, m8 9993 9994 paddw m11, m0, m1 9995 paddw m11, m2 9996 paddw m11, m3 9997 paddw m11, m4 9998 paddw m11, m5 9999 paddw m11, m6 10000 paddw m11, m7 10001 10002 pmaddwd m11, [pw_1] 
10003 psrldq m10, m11, 4 10004 paddd m11, m10 10005 psrld m11, 2 10006 10007 mova m9, m0 10008 paddw m0, m1 ; m0+m1 10009 psubw m1, m9 ; m1-m0 10010 mova m9, m2 10011 paddw m2, m3 ; m2+m3 10012 psubw m3, m9 ; m3-m2 10013 mova m9, m0 10014 paddw m0, m2 ; m0+m1+m2+m3 10015 psubw m2, m9 ; m2+m3-m0+m1 10016 mova m9, m1 10017 paddw m1, m3 ; m1-m0+m3-m2 10018 psubw m3, m9 ; m3-m2-m1-m0 10019 10020 movdqa m9, m4 10021 paddw m4, m5 ; m4+m5 10022 psubw m5, m9 ; m5-m4 10023 movdqa m9, m6 10024 paddw m6, m7 ; m6+m7 10025 psubw m7, m9 ; m7-m6 10026 movdqa m9, m4 10027 paddw m4, m6 ; m4+m5+m6+m7 10028 psubw m6, m9 ; m6+m7-m4+m5 10029 movdqa m9, m5 10030 paddw m5, m7 ; m5-m4+m7-m6 10031 psubw m7, m9 ; m7-m6-m5-m4 10032 10033 movdqa m9, m0 10034 paddw m0, m4 ; (m0+m1+m2+m3)+(m4+m5+m6+m7) 10035 psubw m4, m9 ; (m4+m5+m6+m7)-(m0+m1+m2+m3) 10036 movdqa m9, m1 10037 paddw m1, m5 ; (m1-m0+m3-m2)+(m5-m4+m7-m6) 10038 psubw m5, m9 ; (m5-m4+m7-m6)-(m1-m0+m3-m2) 10039 10040 mova m9, m0 10041 vshufps m9, m9, m4, 11011101b 10042 vshufps m0, m0, m4, 10001000b 10043 10044 movdqa m4, m0 10045 paddw m0, m9 ; (a0 + a4) + (a4 - a0) 10046 psubw m9, m4 ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4) 10047 10048 movaps m4, m1 10049 vshufps m4, m4, m5, 11011101b 10050 vshufps m1, m1, m5, 10001000b 10051 10052 movdqa m5, m1 10053 paddw m1, m4 10054 psubw m4, m5 10055 movdqa m5, m2 10056 paddw m2, m6 10057 psubw m6, m5 10058 movdqa m5, m3 10059 paddw m3, m7 10060 psubw m7, m5 10061 10062 movaps m5, m2 10063 vshufps m5, m5, m6, 11011101b 10064 vshufps m2, m2, m6, 10001000b 10065 10066 movdqa m6, m2 10067 paddw m2, m5 10068 psubw m5, m6 10069 movaps m6, m3 10070 10071 vshufps m6, m6, m7, 11011101b 10072 vshufps m3, m3, m7, 10001000b 10073 10074 movdqa m7, m3 10075 paddw m3, m6 10076 psubw m6, m7 10077 movdqa m7, m0 10078 10079 pblendw m0, m9, 10101010b 10080 pslld m9, 10h 10081 psrld m7, 10h 10082 por m9, m7 10083 pabsw m0, m0 10084 pabsw m9, m9 10085 pmaxsw m0, m9 10086 movdqa m7, m1 10087 pblendw m1, m4, 10101010b 10088 pslld m4, 10h 10089 psrld m7, 10h 10090 por m4, m7 10091 pabsw m1, m1 10092 pabsw m4, m4 10093 pmaxsw m1, m4 10094 movdqa m7, m2 10095 pblendw m2, m5, 10101010b 10096 pslld m5, 10h 10097 psrld m7, 10h 10098 por m5, m7 10099 pabsw m2, m2 10100 pabsw m5, m5 10101 pmaxsw m2, m5 10102 mova m7, m3 10103 10104 pblendw m3, m6, 10101010b 10105 pslld m6, 10h 10106 psrld m7, 10h 10107 por m6, m7 10108 pabsw m3, m3 10109 pabsw m6, m6 10110 pmaxsw m3, m6 10111 paddw m0, m1 10112 paddw m0, m2 10113 paddw m0, m3 10114 pmaddwd m0, [pw_1] 10115 psrldq m1, m0, 8 10116 paddd m0, m1 10117 10118 pshuflw m1, m0, 00001110b 10119 paddd m0, m1 10120 paddd m0, [pd_1] 10121 psrld m0, 1 10122 10123 psubd m0, m11 10124 10125 vextracti128 xm1, m0, 1 10126 psubd m0, m1 10127 pabsd m0, m0 10128%endmacro 10129 10130%macro PSY_PP_8x8_AVX2 0 10131 lea r4, [r1 * 3] 10132 movu xm0, [r0] 10133 movu xm1, [r0 + r1] 10134 movu xm2, [r0 + r1 * 2] 10135 movu xm3, [r0 + r4] 10136 lea r5, [r0 + r1 * 4] 10137 movu xm4, [r5] 10138 movu xm5, [r5 + r1] 10139 movu xm6, [r5 + r1 * 2] 10140 movu xm7, [r5 + r4] 10141 10142 lea r4, [r3 * 3] 10143 vinserti128 m0, m0, [r2], 1 10144 vinserti128 m1, m1, [r2 + r3], 1 10145 vinserti128 m2, m2, [r2 + r3 * 2], 1 10146 vinserti128 m3, m3, [r2 + r4], 1 10147 lea r5, [r2 + r3 * 4] 10148 vinserti128 m4, m4, [r5], 1 10149 vinserti128 m5, m5, [r5 + r3], 1 10150 vinserti128 m6, m6, [r5 + r3 * 2], 1 10151 vinserti128 m7, m7, [r5 + r4], 1 10152 10153 paddw m8, m0, m1 10154 paddw m8, m2 10155 paddw m8, m3 10156 paddw m8, m4 10157 
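    ; (In this macro the paddw chain into m8 builds the pixel-sum term, and the
    ; psubw/paddw pairs with the punpck{wd,dq,qdq} interleaves that follow are the
    ; 8x8 Hadamard butterflies with their transposes done in-register; the final
    ; pabsw/pmaxsw pairs exploit max(|a|,|b|) = (|a+b| + |a-b|) / 2 to merge the
    ; last butterfly stage with the /2 normalization.)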
paddw m8, m5 10158 paddw m8, m6 10159 paddw m8, m7 10160 pmaddwd m8, [pw_1] 10161 10162 psrldq m9, m8, 8 10163 paddd m8, m9 10164 psrldq m9, m8, 4 10165 paddd m8, m9 10166 psrld m8, 2 10167 10168 psubw m9, m1, m0 10169 paddw m0, m1 10170 psubw m1, m3, m2 10171 paddw m2, m3 10172 punpckhwd m3, m0, m9 10173 punpcklwd m0, m9 10174 psubw m9, m3, m0 10175 paddw m0, m3 10176 punpckhwd m3, m2, m1 10177 punpcklwd m2, m1 10178 psubw m10, m3, m2 10179 paddw m2, m3 10180 psubw m3, m5, m4 10181 paddw m4, m5 10182 psubw m5, m7, m6 10183 paddw m6, m7 10184 punpckhwd m1, m4, m3 10185 punpcklwd m4, m3 10186 psubw m7, m1, m4 10187 paddw m4, m1 10188 punpckhwd m3, m6, m5 10189 punpcklwd m6, m5 10190 psubw m1, m3, m6 10191 paddw m6, m3 10192 psubw m3, m2, m0 10193 paddw m0, m2 10194 psubw m2, m10, m9 10195 paddw m9, m10 10196 punpckhdq m5, m0, m3 10197 punpckldq m0, m3 10198 psubw m10, m5, m0 10199 paddw m0, m5 10200 punpckhdq m3, m9, m2 10201 punpckldq m9, m2 10202 psubw m5, m3, m9 10203 paddw m9, m3 10204 psubw m3, m6, m4 10205 paddw m4, m6 10206 psubw m6, m1, m7 10207 paddw m7, m1 10208 punpckhdq m2, m4, m3 10209 punpckldq m4, m3 10210 psubw m1, m2, m4 10211 paddw m4, m2 10212 punpckhdq m3, m7, m6 10213 punpckldq m7, m6 10214 psubw m2, m3, m7 10215 paddw m7, m3 10216 psubw m3, m4, m0 10217 paddw m0, m4 10218 psubw m4, m1, m10 10219 paddw m10, m1 10220 punpckhqdq m6, m0, m3 10221 punpcklqdq m0, m3 10222 pabsw m0, m0 10223 pabsw m6, m6 10224 pmaxsw m0, m6 10225 punpckhqdq m3, m10, m4 10226 punpcklqdq m10, m4 10227 pabsw m10, m10 10228 pabsw m3, m3 10229 pmaxsw m10, m3 10230 psubw m3, m7, m9 10231 paddw m9, m7 10232 psubw m7, m2, m5 10233 paddw m5, m2 10234 punpckhqdq m4, m9, m3 10235 punpcklqdq m9, m3 10236 pabsw m9, m9 10237 pabsw m4, m4 10238 pmaxsw m9, m4 10239 punpckhqdq m3, m5, m7 10240 punpcklqdq m5, m7 10241 pabsw m5, m5 10242 pabsw m3, m3 10243 pmaxsw m5, m3 10244 paddd m0, m9 10245 paddd m0, m10 10246 paddd m0, m5 10247 psrld m9, m0, 16 10248 pslld m0, 16 10249 psrld m0, 16 10250 paddd m0, m9 10251 psrldq m9, m0, 8 10252 paddd m0, m9 10253 psrldq m9, m0, 4 10254 paddd m0, m9 10255 paddd m0, [pd_1] 10256 psrld m0, 1 10257 psubd m0, m8 10258 10259 vextracti128 xm1, m0, 1 10260 psubd xm1, xm0 10261 pabsd xm1, xm1 10262%endmacro 10263 10264%macro PSY_COST_PP_8x8_MAIN12 0 10265 ; load source pixels 10266 lea r4, [r1 * 3] 10267 pmovzxwd m0, [r0] 10268 pmovzxwd m1, [r0 + r1] 10269 pmovzxwd m2, [r0 + r1 * 2] 10270 pmovzxwd m3, [r0 + r4] 10271 lea r5, [r0 + r1 * 4] 10272 pmovzxwd m4, [r5] 10273 pmovzxwd m5, [r5 + r1] 10274 pmovzxwd m6, [r5 + r1 * 2] 10275 pmovzxwd m7, [r5 + r4] 10276 10277 ; source SAD 10278 paddd m8, m0, m1 10279 paddd m8, m2 10280 paddd m8, m3 10281 paddd m8, m4 10282 paddd m8, m5 10283 paddd m8, m6 10284 paddd m8, m7 10285 10286 vextracti128 xm9, m8, 1 10287 paddd m8, m9 ; sad_8x8 10288 movhlps xm9, xm8 10289 paddd xm8, xm9 10290 pshuflw xm9, xm8, 0Eh 10291 paddd xm8, xm9 10292 psrld m8, 2 10293 10294 ; source SA8D 10295 psubd m9, m1, m0 10296 paddd m0, m1 10297 psubd m1, m3, m2 10298 paddd m2, m3 10299 punpckhdq m3, m0, m9 10300 punpckldq m0, m9 10301 psubd m9, m3, m0 10302 paddd m0, m3 10303 punpckhdq m3, m2, m1 10304 punpckldq m2, m1 10305 psubd m10, m3, m2 10306 paddd m2, m3 10307 psubd m3, m5, m4 10308 paddd m4, m5 10309 psubd m5, m7, m6 10310 paddd m6, m7 10311 punpckhdq m1, m4, m3 10312 punpckldq m4, m3 10313 psubd m7, m1, m4 10314 paddd m4, m1 10315 punpckhdq m3, m6, m5 10316 punpckldq m6, m5 10317 psubd m1, m3, m6 10318 paddd m6, m3 10319 psubd m3, m2, m0 10320 paddd m0, m2 
10321 psubd m2, m10, m9 10322 paddd m9, m10 10323 punpckhqdq m5, m0, m3 10324 punpcklqdq m0, m3 10325 psubd m10, m5, m0 10326 paddd m0, m5 10327 punpckhqdq m3, m9, m2 10328 punpcklqdq m9, m2 10329 psubd m5, m3, m9 10330 paddd m9, m3 10331 psubd m3, m6, m4 10332 paddd m4, m6 10333 psubd m6, m1, m7 10334 paddd m7, m1 10335 punpckhqdq m2, m4, m3 10336 punpcklqdq m4, m3 10337 psubd m1, m2, m4 10338 paddd m4, m2 10339 punpckhqdq m3, m7, m6 10340 punpcklqdq m7, m6 10341 psubd m2, m3, m7 10342 paddd m7, m3 10343 psubd m3, m4, m0 10344 paddd m0, m4 10345 psubd m4, m1, m10 10346 paddd m10, m1 10347 vinserti128 m6, m0, xm3, 1 10348 vperm2i128 m0, m0, m3, 00110001b 10349 pabsd m0, m0 10350 pabsd m6, m6 10351 pmaxsd m0, m6 10352 vinserti128 m3, m10, xm4, 1 10353 vperm2i128 m10, m10, m4, 00110001b 10354 pabsd m10, m10 10355 pabsd m3, m3 10356 pmaxsd m10, m3 10357 psubd m3, m7, m9 10358 paddd m9, m7 10359 psubd m7, m2, m5 10360 paddd m5, m2 10361 vinserti128 m4, m9, xm3, 1 10362 vperm2i128 m9, m9, m3, 00110001b 10363 pabsd m9, m9 10364 pabsd m4, m4 10365 pmaxsd m9, m4 10366 vinserti128 m3, m5, xm7, 1 10367 vperm2i128 m5, m5, m7, 00110001b 10368 pabsd m5, m5 10369 pabsd m3, m3 10370 pmaxsd m5, m3 10371 paddd m0, m9 10372 paddd m0, m10 10373 paddd m0, m5 10374 10375 vextracti128 xm9, m0, 1 10376 paddd m0, m9 ; sad_8x8 10377 movhlps xm9, xm0 10378 paddd xm0, xm9 10379 pshuflw xm9, xm0, 0Eh 10380 paddd xm0, xm9 10381 paddd m0, [pd_1] 10382 psrld m0, 1 ; sa8d_8x8 10383 psubd m11, m0, m8 ; sa8d_8x8 - sad_8x8 10384 10385 ; load recon pixels 10386 lea r4, [r3 * 3] 10387 pmovzxwd m0, [r2] 10388 pmovzxwd m1, [r2 + r3] 10389 pmovzxwd m2, [r2 + r3 * 2] 10390 pmovzxwd m3, [r2 + r4] 10391 lea r5, [r2 + r3 * 4] 10392 pmovzxwd m4, [r5] 10393 pmovzxwd m5, [r5 + r3] 10394 pmovzxwd m6, [r5 + r3 * 2] 10395 pmovzxwd m7, [r5 + r4] 10396 10397 ; recon SAD 10398 paddd m8, m0, m1 10399 paddd m8, m2 10400 paddd m8, m3 10401 paddd m8, m4 10402 paddd m8, m5 10403 paddd m8, m6 10404 paddd m8, m7 10405 10406 vextracti128 xm9, m8, 1 10407 paddd m8, m9 ; sad_8x8 10408 movhlps xm9, xm8 10409 paddd xm8, xm9 10410 pshuflw xm9, xm8, 0Eh 10411 paddd xm8, xm9 10412 psrld m8, 2 10413 10414 ; recon SA8D 10415 psubd m9, m1, m0 10416 paddd m0, m1 10417 psubd m1, m3, m2 10418 paddd m2, m3 10419 punpckhdq m3, m0, m9 10420 punpckldq m0, m9 10421 psubd m9, m3, m0 10422 paddd m0, m3 10423 punpckhdq m3, m2, m1 10424 punpckldq m2, m1 10425 psubd m10, m3, m2 10426 paddd m2, m3 10427 psubd m3, m5, m4 10428 paddd m4, m5 10429 psubd m5, m7, m6 10430 paddd m6, m7 10431 punpckhdq m1, m4, m3 10432 punpckldq m4, m3 10433 psubd m7, m1, m4 10434 paddd m4, m1 10435 punpckhdq m3, m6, m5 10436 punpckldq m6, m5 10437 psubd m1, m3, m6 10438 paddd m6, m3 10439 psubd m3, m2, m0 10440 paddd m0, m2 10441 psubd m2, m10, m9 10442 paddd m9, m10 10443 punpckhqdq m5, m0, m3 10444 punpcklqdq m0, m3 10445 psubd m10, m5, m0 10446 paddd m0, m5 10447 punpckhqdq m3, m9, m2 10448 punpcklqdq m9, m2 10449 psubd m5, m3, m9 10450 paddd m9, m3 10451 psubd m3, m6, m4 10452 paddd m4, m6 10453 psubd m6, m1, m7 10454 paddd m7, m1 10455 punpckhqdq m2, m4, m3 10456 punpcklqdq m4, m3 10457 psubd m1, m2, m4 10458 paddd m4, m2 10459 punpckhqdq m3, m7, m6 10460 punpcklqdq m7, m6 10461 psubd m2, m3, m7 10462 paddd m7, m3 10463 psubd m3, m4, m0 10464 paddd m0, m4 10465 psubd m4, m1, m10 10466 paddd m10, m1 10467 vinserti128 m6, m0, xm3, 1 10468 vperm2i128 m0, m0, m3, 00110001b 10469 pabsd m0, m0 10470 pabsd m6, m6 10471 pmaxsd m0, m6 10472 vinserti128 m3, m10, xm4, 1 10473 vperm2i128 m10, m10, m4, 
00110001b 10474 pabsd m10, m10 10475 pabsd m3, m3 10476 pmaxsd m10, m3 10477 psubd m3, m7, m9 10478 paddd m9, m7 10479 psubd m7, m2, m5 10480 paddd m5, m2 10481 vinserti128 m4, m9, xm3, 1 10482 vperm2i128 m9, m9, m3, 00110001b 10483 pabsd m9, m9 10484 pabsd m4, m4 10485 pmaxsd m9, m4 10486 vinserti128 m3, m5, xm7, 1 10487 vperm2i128 m5, m5, m7, 00110001b 10488 pabsd m5, m5 10489 pabsd m3, m3 10490 pmaxsd m5, m3 10491 paddd m0, m9 10492 paddd m0, m10 10493 paddd m0, m5 10494 10495 vextracti128 xm9, m0, 1 10496 paddd m0, m9 ; sad_8x8 10497 movhlps xm9, xm0 10498 paddd xm0, xm9 10499 pshuflw xm9, xm0, 0Eh 10500 paddd xm0, xm9 10501 paddd m0, [pd_1] 10502 psrld m0, 1 ; sa8d_8x8 10503 psubd m0, m8 ; sa8d_8x8 - sad_8x8 10504 10505 psubd m11, m0 10506 pabsd m11, m11 10507%endmacro 10508 10509%macro PSY_COST_PP_8x8_AVX512_MAIN12 0 10510 ; load source and recon pixels 10511 lea r4, [r1 * 3] 10512 pmovzxwd ym0, [r0] 10513 pmovzxwd ym1, [r0 + r1] 10514 pmovzxwd ym2, [r0 + r1 * 2] 10515 pmovzxwd ym3, [r0 + r4] 10516 lea r5, [r0 + r1 * 4] 10517 pmovzxwd ym4, [r5] 10518 pmovzxwd ym5, [r5 + r1] 10519 pmovzxwd ym6, [r5 + r1 * 2] 10520 pmovzxwd ym7, [r5 + r4] 10521 10522 lea r4, [r3 * 3] 10523 pmovzxwd ym16, [r2] 10524 pmovzxwd ym17, [r2 + r3] 10525 pmovzxwd ym18, [r2 + r3 * 2] 10526 pmovzxwd ym19, [r2 + r4] 10527 lea r5, [r2 + r3 * 4] 10528 pmovzxwd ym20, [r5] 10529 pmovzxwd ym21, [r5 + r3] 10530 pmovzxwd ym22, [r5 + r3 * 2] 10531 pmovzxwd ym23, [r5 + r4] 10532 10533 vinserti64x4 m0, m0, ym16, 1 10534 vinserti64x4 m1, m1, ym17, 1 10535 vinserti64x4 m2, m2, ym18, 1 10536 vinserti64x4 m3, m3, ym19, 1 10537 vinserti64x4 m4, m4, ym20, 1 10538 vinserti64x4 m5, m5, ym21, 1 10539 vinserti64x4 m6, m6, ym22, 1 10540 vinserti64x4 m7, m7, ym23, 1 10541 10542 ; source + recon SAD 10543 paddd m8, m0, m1 10544 paddd m8, m2 10545 paddd m8, m3 10546 paddd m8, m4 10547 paddd m8, m5 10548 paddd m8, m6 10549 paddd m8, m7 10550 10551 vextracti64x4 ym15, m8, 1 10552 10553 vextracti128 xm9, ym8, 1 10554 paddd ym8, ym9 ; sad_8x8 10555 movhlps xm9, xm8 10556 paddd xm8, xm9 10557 pshuflw xm9, xm8, 0Eh 10558 paddd xm8, xm9 10559 psrld ym8, 2 10560 10561 vextracti128 xm9, ym15, 1 10562 paddd ym15, ym9 ; sad_8x8 10563 movhlps xm9, xm15 10564 paddd xm15, xm9 10565 pshuflw xm9, xm15, 0Eh 10566 paddd xm15, xm9 10567 psrld ym15, 2 10568 10569 ; source and recon SA8D 10570 psubd m9, m1, m0 10571 paddd m0, m1 10572 psubd m1, m3, m2 10573 paddd m2, m3 10574 punpckhdq m3, m0, m9 10575 punpckldq m0, m9 10576 psubd m9, m3, m0 10577 paddd m0, m3 10578 punpckhdq m3, m2, m1 10579 punpckldq m2, m1 10580 psubd m10, m3, m2 10581 paddd m2, m3 10582 psubd m3, m5, m4 10583 paddd m4, m5 10584 psubd m5, m7, m6 10585 paddd m6, m7 10586 punpckhdq m1, m4, m3 10587 punpckldq m4, m3 10588 psubd m7, m1, m4 10589 paddd m4, m1 10590 punpckhdq m3, m6, m5 10591 punpckldq m6, m5 10592 psubd m1, m3, m6 10593 paddd m6, m3 10594 psubd m3, m2, m0 10595 paddd m0, m2 10596 psubd m2, m10, m9 10597 paddd m9, m10 10598 punpckhqdq m5, m0, m3 10599 punpcklqdq m0, m3 10600 psubd m10, m5, m0 10601 paddd m0, m5 10602 punpckhqdq m3, m9, m2 10603 punpcklqdq m9, m2 10604 psubd m5, m3, m9 10605 paddd m9, m3 10606 psubd m3, m6, m4 10607 paddd m4, m6 10608 psubd m6, m1, m7 10609 paddd m7, m1 10610 punpckhqdq m2, m4, m3 10611 punpcklqdq m4, m3 10612 psubd m1, m2, m4 10613 paddd m4, m2 10614 punpckhqdq m3, m7, m6 10615 punpcklqdq m7, m6 10616 10617 psubd m2, m3, m7 10618 paddd m7, m3 10619 psubd m3, m4, m0 10620 paddd m0, m4 10621 psubd m4, m1, m10 10622 paddd m10, m1 10623 10624 mova 
m16, m13 10625 mova m17, m14 10626 vpermi2q m16, m0, m3 10627 vpermi2q m17, m0, m3 10628 10629 pabsd m17, m17 10630 pabsd m16, m16 10631 pmaxsd m17, m16 10632 10633 mova m18, m13 10634 mova m19, m14 10635 vpermi2q m18, m10, m4 10636 vpermi2q m19, m10, m4 10637 10638 pabsd m19, m19 10639 pabsd m18, m18 10640 pmaxsd m19, m18 10641 psubd m18, m7, m9 10642 paddd m9, m7 10643 psubd m7, m2, m5 10644 paddd m5, m2 10645 10646 mova m20, m13 10647 mova m21, m14 10648 vpermi2q m20, m9, m18 10649 vpermi2q m21, m9, m18 10650 10651 pabsd m21, m21 10652 pabsd m20, m20 10653 pmaxsd m21, m20 10654 10655 mova m22, m13 10656 mova m23, m14 10657 vpermi2q m22, m5, m7 10658 vpermi2q m23, m5, m7 10659 10660 pabsd m23, m23 10661 pabsd m22, m22 10662 pmaxsd m23, m22 10663 paddd m17, m21 10664 paddd m17, m19 10665 paddd m17, m23 10666 10667 vextracti64x4 ym26, m17, 1 10668 10669 vextracti128 xm9, m17, 1 10670 paddd ym17, ym9 ; sad_8x8 10671 movhlps xm9, xm17 10672 paddd xm17, xm9 10673 pshuflw xm9, xm17, 0Eh 10674 paddd xm17, xm9 10675 paddd ym17, [pd_1] 10676 psrld ym17, 1 ; sa8d_8x8 10677 10678 vextracti128 xm9, ym26, 1 10679 paddd ym26, ym9 ; sad_8x8 10680 movhlps xm9, xm26 10681 paddd xm26, xm9 10682 pshuflw xm9, xm26, 0Eh 10683 paddd xm26, xm9 10684 paddd ym26, [pd_1] 10685 psrld ym26, 1 ; sa8d_8x8 10686 10687 10688 10689 psubd ym11, ym17, ym8 ; sa8d_8x8 - sad_8x8 10690 psubd ym12, ym26, ym15 ; sa8d_8x8 - sad_8x8 10691 10692 psubd ym11, ym12 10693 pabsd ym11, ym11 10694%endmacro 10695 10696%macro PSY_PP_INPUT_AVX512_MAIN10 0 10697 lea r4, [r1 * 3] 10698 movu xm0, [r0] 10699 movu xm1, [r0 + r1] 10700 movu xm2, [r0 + r1 * 2] 10701 movu xm3, [r0 + r4] 10702 lea r5, [r0 + r1 * 4] 10703 movu xm4, [r5] 10704 movu xm5, [r5 + r1] 10705 movu xm6, [r5 + r1 * 2] 10706 movu xm7, [r5 + r4] 10707 10708 lea r4, [r3 * 3] 10709 vinserti128 ym0, ym0, [r2], 1 10710 vinserti128 ym1, ym1, [r2 + r3], 1 10711 vinserti128 ym2, ym2, [r2 + r3 * 2], 1 10712 vinserti128 ym3, ym3, [r2 + r4], 1 10713 lea r5, [r2 + r3 * 4] 10714 vinserti128 ym4, ym4, [r5], 1 10715 vinserti128 ym5, ym5, [r5 + r3], 1 10716 vinserti128 ym6, ym6, [r5 + r3 * 2], 1 10717 vinserti128 ym7, ym7, [r5 + r4], 1 10718 10719 add r0, 16 10720 add r2, 16 10721 10722 lea r4, [r1 * 3] 10723 vinserti32x4 m0, m0, [r0], 2 10724 vinserti32x4 m1, m1, [r0 + r1], 2 10725 vinserti32x4 m2, m2, [r0 + r1 * 2], 2 10726 vinserti32x4 m3, m3, [r0 + r4], 2 10727 lea r5, [r0 + r1 * 4] 10728 vinserti32x4 m4, m4, [r5], 2 10729 vinserti32x4 m5, m5, [r5 + r1], 2 10730 vinserti32x4 m6, m6, [r5 + r1 * 2], 2 10731 vinserti32x4 m7, m7, [r5 + r4], 2 10732 10733 lea r4, [r3 * 3] 10734 vinserti32x4 m0, m0, [r2], 3 10735 vinserti32x4 m1, m1, [r2 + r3], 3 10736 vinserti32x4 m2, m2, [r2 + r3 * 2], 3 10737 vinserti32x4 m3, m3, [r2 + r4], 3 10738 lea r5, [r2 + r3 * 4] 10739 vinserti32x4 m4, m4, [r5], 3 10740 vinserti32x4 m5, m5, [r5 + r3], 3 10741 vinserti32x4 m6, m6, [r5 + r3 * 2], 3 10742 vinserti32x4 m7, m7, [r5 + r4], 3 10743%endmacro 10744 10745 10746%macro PSY_PP_16x8_AVX512_MAIN10 0 10747 paddw m8, m0, m1 10748 paddw m8, m2 10749 paddw m8, m3 10750 paddw m8, m4 10751 paddw m8, m5 10752 paddw m8, m6 10753 paddw m8, m7 10754 pmaddwd m8, m14 10755 10756 psrldq m9, m8, 8 10757 paddd m8, m9 10758 psrldq m9, m8, 4 10759 paddd m8, m9 10760 psrld m8, 2 10761 10762 psubw m9, m1, m0 10763 paddw m0, m1 10764 psubw m1, m3, m2 10765 paddw m2, m3 10766 punpckhwd m3, m0, m9 10767 punpcklwd m0, m9 10768 psubw m9, m3, m0 10769 paddw m0, m3 10770 punpckhwd m3, m2, m1 10771 punpcklwd m2, m1 10772 psubw m10, m3, m2 
10773 paddw m2, m3 10774 10775 psubw m3, m5, m4 10776 paddw m4, m5 10777 psubw m5, m7, m6 10778 paddw m6, m7 10779 punpckhwd m1, m4, m3 10780 punpcklwd m4, m3 10781 psubw m7, m1, m4 10782 paddw m4, m1 10783 punpckhwd m3, m6, m5 10784 punpcklwd m6, m5 10785 psubw m1, m3, m6 10786 paddw m6, m3 10787 10788 psubw m3, m2, m0 10789 paddw m0, m2 10790 psubw m2, m10, m9 10791 paddw m9, m10 10792 punpckhdq m5, m0, m3 10793 punpckldq m0, m3 10794 psubw m10, m5, m0 10795 paddw m0, m5 10796 punpckhdq m3, m9, m2 10797 punpckldq m9, m2 10798 psubw m5, m3, m9 10799 paddw m9, m3 10800 10801 psubw m3, m6, m4 10802 paddw m4, m6 10803 psubw m6, m1, m7 10804 paddw m7, m1 10805 punpckhdq m2, m4, m3 10806 punpckldq m4, m3 10807 psubw m1, m2, m4 10808 paddw m4, m2 10809 punpckhdq m3, m7, m6 10810 punpckldq m7, m6 10811 psubw m2, m3, m7 10812 paddw m7, m3 10813 10814 psubw m3, m4, m0 10815 paddw m0, m4 10816 psubw m4, m1, m10 10817 paddw m10, m1 10818 punpckhqdq m6, m0, m3 10819 punpcklqdq m0, m3 10820 pabsw m0, m0 10821 pabsw m6, m6 10822 pmaxsw m0, m6 10823 punpckhqdq m3, m10, m4 10824 punpcklqdq m10, m4 10825 pabsw m10, m10 10826 pabsw m3, m3 10827 pmaxsw m10, m3 10828 10829 psubw m3, m7, m9 10830 paddw m9, m7 10831 psubw m7, m2, m5 10832 paddw m5, m2 10833 punpckhqdq m4, m9, m3 10834 punpcklqdq m9, m3 10835 pabsw m9, m9 10836 pabsw m4, m4 10837 pmaxsw m9, m4 10838 punpckhqdq m3, m5, m7 10839 punpcklqdq m5, m7 10840 pabsw m5, m5 10841 pabsw m3, m3 10842 pmaxsw m5, m3 10843 10844 paddd m0, m9 10845 paddd m0, m10 10846 paddd m0, m5 10847 psrld m9, m0, 16 10848 pslld m0, 16 10849 psrld m0, 16 10850 paddd m0, m9 10851 psrldq m9, m0, 8 10852 paddd m0, m9 10853 psrldq m9, m0, 4 10854 paddd m0, m9 10855 paddd m0, m15 10856 psrld m0, 1 10857 psubd m0, m8 10858 10859 vextracti64x4 ym2, m0, 1 10860 10861 vextracti128 xm3, ym2, 1 10862 psubd xm3, xm2 10863 pabsd xm3, xm3 10864 10865 vextracti128 xm1, ym0, 1 10866 psubd xm1, xm0 10867 pabsd xm1, xm1 10868 paddd xm1, xm3 10869%endmacro 10870 10871%macro PSY_PP_INPUT_AVX512_MAIN 0 10872 movu xm16, [r0 + r1 * 0] 10873 movu xm17, [r0 + r1 * 1] 10874 movu xm18, [r0 + r1 * 2] 10875 movu xm19, [r0 + r4 * 1] 10876 10877 movu xm20, [r2 + r3 * 0] 10878 movu xm21, [r2 + r3 * 1] 10879 movu xm22, [r2 + r3 * 2] 10880 movu xm23, [r2 + r7 * 1] 10881 10882 mova m0, m26 10883 vpermi2q m0, m16, m20 10884 mova m1, m26 10885 vpermi2q m1, m17, m21 10886 mova m2, m26 10887 vpermi2q m2, m18, m22 10888 mova m3, m26 10889 vpermi2q m3, m19, m23 10890 10891 10892 lea r5, [r0 + r1 * 4] 10893 lea r6, [r2 + r3 * 4] 10894 10895 movu xm16, [r5 + r1 * 0] 10896 movu xm17, [r5 + r1 * 1] 10897 movu xm18, [r5 + r1 * 2] 10898 movu xm19, [r5 + r4 * 1] 10899 10900 movu xm20, [r6 + r3 * 0] 10901 movu xm21, [r6 + r3 * 1] 10902 movu xm22, [r6 + r3 * 2] 10903 movu xm23, [r6 + r7 * 1] 10904 10905 mova m4, m26 10906 vpermi2q m4, m16, m20 10907 mova m5, m26 10908 vpermi2q m5, m17, m21 10909 mova m6, m26 10910 vpermi2q m6, m18, m22 10911 mova m7, m26 10912 vpermi2q m7, m19, m23 10913%endmacro 10914 10915%macro PSY_PP_16x8_AVX512_MAIN 0 10916 pmaddubsw m0, m8 10917 pmaddubsw m1, m8 10918 pmaddubsw m2, m8 10919 pmaddubsw m3, m8 10920 pmaddubsw m4, m8 10921 pmaddubsw m5, m8 10922 pmaddubsw m6, m8 10923 pmaddubsw m7, m8 10924 10925 paddw m11, m0, m1 10926 paddw m11, m2 10927 paddw m11, m3 10928 paddw m11, m4 10929 paddw m11, m5 10930 paddw m11, m6 10931 paddw m11, m7 10932 10933 pmaddwd m11, m14 10934 psrldq m10, m11, 4 10935 paddd m11, m10 10936 psrld m11, 2 10937 10938 mova m9, m0 10939 paddw m0, m1 10940 psubw m1, m9 
10941 mova m9, m2 10942 paddw m2, m3 10943 psubw m3, m9 10944 mova m9, m0 10945 paddw m0, m2 10946 psubw m2, m9 10947 mova m9, m1 10948 paddw m1, m3 10949 psubw m3, m9 10950 10951 movdqa m9, m4 10952 paddw m4, m5 10953 psubw m5, m9 10954 movdqa m9, m6 10955 paddw m6, m7 10956 psubw m7, m9 10957 movdqa m9, m4 10958 paddw m4, m6 10959 psubw m6, m9 10960 movdqa m9, m5 10961 paddw m5, m7 10962 psubw m7, m9 10963 10964 movdqa m9, m0 10965 paddw m0, m4 10966 psubw m4, m9 10967 movdqa m9, m1 10968 paddw m1, m5 10969 psubw m5, m9 10970 10971 mova m9, m0 10972 vshufps m9, m9, m4, 11011101b 10973 vshufps m0, m0, m4, 10001000b 10974 10975 movdqa m4, m0 10976 paddw m16, m0, m9 10977 psubw m17, m9, m4 10978 10979 movaps m4, m1 10980 vshufps m4, m4, m5, 11011101b 10981 vshufps m1, m1, m5, 10001000b 10982 10983 movdqa m5, m1 10984 paddw m18, m1, m4 10985 psubw m19, m4, m5 10986 10987 movdqa m5, m2 10988 paddw m2, m6 10989 psubw m6, m5 10990 movdqa m5, m3 10991 paddw m3, m7 10992 psubw m7, m5 10993 10994 movaps m5, m2 10995 vshufps m5, m5, m6, 11011101b 10996 vshufps m2, m2, m6, 10001000b 10997 10998 movdqa m6, m2 10999 paddw m20, m2, m5 11000 psubw m21, m5, m6 11001 11002 movaps m6, m3 11003 11004 vshufps m6, m6, m7, 11011101b 11005 vshufps m3, m3, m7, 10001000b 11006 11007 movdqa m7, m3 11008 paddw m22, m3, m6 11009 psubw m23, m6, m7 11010 11011 movdqa m7, m16 11012 11013 vextracti64x4 ym24, m16, 1 11014 vextracti64x4 ym25, m17, 1 11015 pblendw ym16, ym17, 10101010b 11016 pblendw ym24, ym25, 10101010b 11017 vinserti64x4 m16, m16, ym24, 1 11018 11019 pslld m17, 10h 11020 psrld m7, 10h 11021 por m17, m7 11022 pabsw m16, m16 11023 pabsw m17, m17 11024 pmaxsw m16, m17 11025 movdqa m7, m18 11026 11027 vextracti64x4 ym24, m18, 1 11028 vextracti64x4 ym25, m19, 1 11029 pblendw ym18, ym19, 10101010b 11030 pblendw ym24, ym25, 10101010b 11031 vinserti64x4 m18, m18, ym24, 1 11032 11033 pslld m19, 10h 11034 psrld m7, 10h 11035 por m19, m7 11036 pabsw m18, m18 11037 pabsw m19, m19 11038 pmaxsw m18, m19 11039 movdqa m7, m20 11040 11041 vextracti64x4 ym24, m20, 1 11042 vextracti64x4 ym25, m21, 1 11043 pblendw ym20, ym21, 10101010b 11044 pblendw ym24, ym25, 10101010b 11045 vinserti64x4 m20, m20, ym24, 1 11046 11047 pslld m21, 10h 11048 psrld m7, 10h 11049 por m21, m7 11050 pabsw m20, m20 11051 pabsw m21, m21 11052 pmaxsw m20, m21 11053 mova m7, m22 11054 11055 vextracti64x4 ym24, m22, 1 11056 vextracti64x4 ym25, m23, 1 11057 pblendw ym22, ym23, 10101010b 11058 pblendw ym24, ym25, 10101010b 11059 vinserti64x4 m22, m22, ym24, 1 11060 11061 pslld m23, 10h 11062 psrld m7, 10h 11063 por m23, m7 11064 pabsw m22, m22 11065 pabsw m23, m23 11066 pmaxsw m22, m23 11067 paddw m16, m18 11068 paddw m16, m20 11069 paddw m16, m22 11070 pmaddwd m16, m14 11071 psrldq m1, m16, 8 11072 paddd m16, m1 11073 11074 pshuflw m1, m16, 00001110b 11075 paddd m16, m1 11076 paddd m16, m15 11077 psrld m16, 1 11078 11079 psubd m16, m11 11080 vextracti64x4 ym2, m16, 1 11081 11082 vextracti128 xm1, ym16, 1 11083 psubd xm16, xm1 11084 pabsd xm16, xm16 11085 11086 vextracti128 xm3, ym2, 1 11087 psubd xm3, xm2 11088 pabsd xm3, xm3 11089 paddd xm16, xm3 11090%endmacro 11091 11092 11093%if ARCH_X86_64 11094INIT_YMM avx2 11095%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11096cglobal psyCost_pp_8x8, 4, 8, 12 11097 add r1d, r1d 11098 add r3d, r3d 11099 PSY_COST_PP_8x8_MAIN12 11100 movd eax, xm11 11101 RET 11102%endif 11103 11104%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11105cglobal psyCost_pp_8x8, 4, 8, 11 11106 add r1d, r1d 11107 add r3d, r3d 11108 PSY_PP_8x8_AVX2 11109 
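    ; PSY_PP_8x8_AVX2 leaves the per-block psy cost in xm1; in scalar terms
    ; (a rough sketch of what the SIMD above computes, not the C reference):
    ;   energy(p) = ((sa8d_8x8(p) + 1) >> 1) - (sad_8x8(p) >> 2)
    ;   cost      = abs(energy(source) - energy(recon))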
movd eax, xm1 11110 RET 11111%endif 11112 11113%if BIT_DEPTH == 8 11114cglobal psyCost_pp_8x8, 4, 8, 13 11115 lea r4, [3 * r1] 11116 lea r7, [3 * r3] 11117 mova m8, [hmul_8p] 11118 11119 PSY_PP_8x8 11120 11121 movd eax, xm0 11122 RET 11123%endif 11124%endif 11125 11126%if ARCH_X86_64 11127INIT_YMM avx2 11128%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11129cglobal psyCost_pp_16x16, 4, 10, 13 11130 add r1d, r1d 11131 add r3d, r3d 11132 pxor m12, m12 11133 11134 mov r8d, 2 11135.loopH: 11136 mov r9d, 2 11137.loopW: 11138 PSY_COST_PP_8x8_MAIN12 11139 11140 paddd xm12, xm11 11141 add r0, 16 11142 add r2, 16 11143 dec r9d 11144 jnz .loopW 11145 lea r0, [r0 + r1 * 8 - 32] 11146 lea r2, [r2 + r3 * 8 - 32] 11147 dec r8d 11148 jnz .loopH 11149 movd eax, xm12 11150 RET 11151%endif 11152 11153%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11154cglobal psyCost_pp_16x16, 4, 10, 12 11155 add r1d, r1d 11156 add r3d, r3d 11157 pxor m11, m11 11158 11159 mov r8d, 2 11160.loopH: 11161 mov r9d, 2 11162.loopW: 11163 PSY_PP_8x8_AVX2 11164 11165 paddd xm11, xm1 11166 add r0, 16 11167 add r2, 16 11168 dec r9d 11169 jnz .loopW 11170 lea r0, [r0 + r1 * 8 - 32] 11171 lea r2, [r2 + r3 * 8 - 32] 11172 dec r8d 11173 jnz .loopH 11174 movd eax, xm11 11175 RET 11176%endif 11177 11178%if BIT_DEPTH == 8 11179cglobal psyCost_pp_16x16, 4, 10, 14 11180 lea r4, [3 * r1] 11181 lea r7, [3 * r3] 11182 mova m8, [hmul_8p] 11183 pxor m13, m13 11184 11185 mov r8d, 2 11186.loopH: 11187 mov r9d, 2 11188.loopW: 11189 PSY_PP_8x8 11190 11191 paddd m13, m0 11192 add r0, 8 11193 add r2, 8 11194 dec r9d 11195 jnz .loopW 11196 lea r0, [r0 + r1 * 8 - 16] 11197 lea r2, [r2 + r3 * 8 - 16] 11198 dec r8d 11199 jnz .loopH 11200 movd eax, xm13 11201 RET 11202%endif 11203%endif 11204 11205%if ARCH_X86_64 11206INIT_YMM avx2 11207%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11208cglobal psyCost_pp_32x32, 4, 10, 13 11209 add r1d, r1d 11210 add r3d, r3d 11211 pxor m12, m12 11212 11213 mov r8d, 4 11214.loopH: 11215 mov r9d, 4 11216.loopW: 11217 PSY_COST_PP_8x8_MAIN12 11218 11219 paddd xm12, xm11 11220 add r0, 16 11221 add r2, 16 11222 dec r9d 11223 jnz .loopW 11224 lea r0, [r0 + r1 * 8 - 64] 11225 lea r2, [r2 + r3 * 8 - 64] 11226 dec r8d 11227 jnz .loopH 11228 movd eax, xm12 11229 RET 11230%endif 11231 11232%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11233cglobal psyCost_pp_32x32, 4, 10, 12 11234 add r1d, r1d 11235 add r3d, r3d 11236 pxor m11, m11 11237 11238 mov r8d, 4 11239.loopH: 11240 mov r9d, 4 11241.loopW: 11242 PSY_PP_8x8_AVX2 11243 11244 paddd xm11, xm1 11245 add r0, 16 11246 add r2, 16 11247 dec r9d 11248 jnz .loopW 11249 lea r0, [r0 + r1 * 8 - 64] 11250 lea r2, [r2 + r3 * 8 - 64] 11251 dec r8d 11252 jnz .loopH 11253 movd eax, xm11 11254 RET 11255%endif 11256 11257%if BIT_DEPTH == 8 11258cglobal psyCost_pp_32x32, 4, 10, 14 11259 lea r4, [3 * r1] 11260 lea r7, [3 * r3] 11261 mova m8, [hmul_8p] 11262 pxor m13, m13 11263 11264 mov r8d, 4 11265.loopH: 11266 mov r9d, 4 11267.loopW: 11268 PSY_PP_8x8 11269 11270 paddd m13, m0 11271 add r0, 8 11272 add r2, 8 11273 dec r9d 11274 jnz .loopW 11275 lea r0, [r0 + r1 * 8 - 32] 11276 lea r2, [r2 + r3 * 8 - 32] 11277 dec r8d 11278 jnz .loopH 11279 movd eax, xm13 11280 RET 11281%endif 11282%endif 11283 11284%if ARCH_X86_64 11285INIT_YMM avx2 11286%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11287cglobal psyCost_pp_64x64, 4, 10, 13 11288 add r1d, r1d 11289 add r3d, r3d 11290 pxor m12, m12 11291 11292 mov r8d, 8 11293.loopH: 11294 mov r9d, 8 11295.loopW: 11296 PSY_COST_PP_8x8_MAIN12 11297 11298 paddd xm12, xm11 11299 add r0, 16 11300 add r2, 16 11301 
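    ; (the 16-byte step above advances one 8x8 block horizontally: 8 samples
    ; stored as 16-bit words, matching the doubled strides set up on entry)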
dec r9d 11302 jnz .loopW 11303 lea r0, [r0 + r1 * 8 - 128] 11304 lea r2, [r2 + r3 * 8 - 128] 11305 dec r8d 11306 jnz .loopH 11307 movd eax, xm12 11308 RET 11309%endif 11310 11311%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11312cglobal psyCost_pp_64x64, 4, 10, 12 11313 add r1d, r1d 11314 add r3d, r3d 11315 pxor m11, m11 11316 11317 mov r8d, 8 11318.loopH: 11319 mov r9d, 8 11320.loopW: 11321 PSY_PP_8x8_AVX2 11322 11323 paddd xm11, xm1 11324 add r0, 16 11325 add r2, 16 11326 dec r9d 11327 jnz .loopW 11328 lea r0, [r0 + r1 * 8 - 128] 11329 lea r2, [r2 + r3 * 8 - 128] 11330 dec r8d 11331 jnz .loopH 11332 movd eax, xm11 11333 RET 11334%endif 11335 11336%if BIT_DEPTH == 8 11337cglobal psyCost_pp_64x64, 4, 10, 14 11338 lea r4, [3 * r1] 11339 lea r7, [3 * r3] 11340 mova m8, [hmul_8p] 11341 pxor m13, m13 11342 11343 mov r8d, 8 11344.loopH: 11345 mov r9d, 8 11346.loopW: 11347 PSY_PP_8x8 11348 11349 paddd m13, m0 11350 add r0, 8 11351 add r2, 8 11352 dec r9d 11353 jnz .loopW 11354 lea r0, [r0 + r1 * 8 - 64] 11355 lea r2, [r2 + r3 * 8 - 64] 11356 dec r8d 11357 jnz .loopH 11358 movd eax, xm13 11359 RET 11360%endif 11361%endif 11362%if ARCH_X86_64 11363INIT_ZMM avx512 11364%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11365cglobal psyCost_pp_16x16, 4, 10, 27 11366 add r1d, r1d 11367 add r3d, r3d 11368 pxor m24, m24 11369 movu m13, [psy_pp_shuff1] 11370 movu m14, [psy_pp_shuff2] 11371 11372 mov r8d, 2 11373.loopH: 11374 mov r9d, 2 11375.loopW: 11376 PSY_COST_PP_8x8_AVX512_MAIN12 11377 11378 paddd xm24, xm11 11379 add r0, 16 11380 add r2, 16 11381 dec r9d 11382 jnz .loopW 11383 lea r0, [r0 + r1 * 8 - 32] 11384 lea r2, [r2 + r3 * 8 - 32] 11385 dec r8d 11386 jnz .loopH 11387 movd eax, xm24 11388 RET 11389%endif 11390 11391%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11392cglobal psyCost_pp_16x16, 4, 10, 16 11393 add r1d, r1d 11394 add r3d, r3d 11395 pxor m11, m11 11396 vbroadcasti32x8 m14, [pw_1] 11397 vbroadcasti32x8 m15, [pd_1] 11398 11399 mov r8d, 2 11400.loopH: 11401 PSY_PP_INPUT_AVX512_MAIN10 11402 PSY_PP_16x8_AVX512_MAIN10 11403 11404 paddd xm11, xm1 11405 lea r0, [r0 + r1 * 8 - 16] 11406 lea r2, [r2 + r3 * 8 - 16] 11407 dec r8d 11408 jnz .loopH 11409 movd eax, xm11 11410 RET 11411%endif 11412 11413%if BIT_DEPTH == 8 11414cglobal psyCost_pp_16x16, 4, 10, 27 11415 lea r4, [3 * r1] 11416 lea r7, [3 * r3] 11417 vbroadcasti32x8 m8, [hmul_8p] 11418 pxor m13, m13 11419 vbroadcasti32x8 m14, [pw_1] 11420 vbroadcasti32x8 m15, [pd_1] 11421 movu m26, [psy_pp_shuff3] 11422 11423 mov r8d, 2 11424.loopH: 11425 PSY_PP_INPUT_AVX512_MAIN 11426 PSY_PP_16x8_AVX512_MAIN 11427 11428 paddd m13, m16 11429 lea r0, [r0 + r1 * 8] 11430 lea r2, [r2 + r3 * 8] 11431 dec r8d 11432 jnz .loopH 11433 movd eax, xm13 11434 RET 11435%endif 11436%endif 11437 11438%if ARCH_X86_64 11439INIT_ZMM avx512 11440%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11441cglobal psyCost_pp_32x32, 4, 10, 27 11442 add r1d, r1d 11443 add r3d, r3d 11444 pxor m24, m24 11445 movu m13, [psy_pp_shuff1] 11446 movu m14, [psy_pp_shuff2] 11447 11448 mov r8d, 4 11449.loopH: 11450 mov r9d, 4 11451.loopW: 11452 PSY_COST_PP_8x8_AVX512_MAIN12 11453 11454 paddd xm24, xm11 11455 add r0, 16 11456 add r2, 16 11457 dec r9d 11458 jnz .loopW 11459 lea r0, [r0 + r1 * 8 - 64] 11460 lea r2, [r2 + r3 * 8 - 64] 11461 dec r8d 11462 jnz .loopH 11463 movd eax, xm24 11464 RET 11465%endif 11466 11467%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11468cglobal psyCost_pp_32x32, 4, 10, 16 11469 add r1d, r1d 11470 add r3d, r3d 11471 pxor m11, m11 11472 vbroadcasti32x8 m14, [pw_1] 11473 vbroadcasti32x8 m15, [pd_1] 11474 11475 mov 
r8d, 4 11476.loopH: 11477 mov r9d, 2 11478.loopW: 11479 PSY_PP_INPUT_AVX512_MAIN10 11480 PSY_PP_16x8_AVX512_MAIN10 11481 11482 paddd xm11, xm1 11483 add r0, 16 11484 add r2, 16 11485 dec r9d 11486 jnz .loopW 11487 lea r0, [r0 + r1 * 8 - 64] 11488 lea r2, [r2 + r3 * 8 - 64] 11489 dec r8d 11490 jnz .loopH 11491 movd eax, xm11 11492 RET 11493%endif 11494 11495%if BIT_DEPTH == 8 11496cglobal psyCost_pp_32x32, 4, 10, 27 11497 lea r4, [3 * r1] 11498 lea r7, [3 * r3] 11499 vbroadcasti32x8 m8, [hmul_8p] 11500 pxor m13, m13 11501 vbroadcasti32x8 m14, [pw_1] 11502 vbroadcasti32x8 m15, [pd_1] 11503 movu m26, [psy_pp_shuff3] 11504 11505 mov r8d, 4 11506.loopH: 11507 mov r9d, 2 11508.loopW: 11509 PSY_PP_INPUT_AVX512_MAIN 11510 PSY_PP_16x8_AVX512_MAIN 11511 11512 paddd m13, m16 11513 add r0, 16 11514 add r2, 16 11515 dec r9d 11516 jnz .loopW 11517 lea r0, [r0 + r1 * 8 - 32] 11518 lea r2, [r2 + r3 * 8 - 32] 11519 dec r8d 11520 jnz .loopH 11521 movd eax, xm13 11522 RET 11523%endif 11524%endif 11525 11526%if ARCH_X86_64 11527INIT_ZMM avx512 11528%if HIGH_BIT_DEPTH && BIT_DEPTH == 12 11529cglobal psyCost_pp_64x64, 4, 10, 27 11530 add r1d, r1d 11531 add r3d, r3d 11532 pxor m24, m24 11533 movu m13, [psy_pp_shuff1] 11534 movu m14, [psy_pp_shuff2] 11535 11536 mov r8d, 8 11537.loopH: 11538 mov r9d, 8 11539.loopW: 11540 PSY_COST_PP_8x8_AVX512_MAIN12 11541 11542 paddd xm24, xm11 11543 add r0, 16 11544 add r2, 16 11545 dec r9d 11546 jnz .loopW 11547 lea r0, [r0 + r1 * 8 - 128] 11548 lea r2, [r2 + r3 * 8 - 128] 11549 dec r8d 11550 jnz .loopH 11551 movd eax, xm24 11552 RET 11553%endif 11554 11555%if HIGH_BIT_DEPTH && BIT_DEPTH == 10 11556cglobal psyCost_pp_64x64, 4, 10, 16 11557 add r1d, r1d 11558 add r3d, r3d 11559 pxor m11, m11 11560 vbroadcasti32x8 m14, [pw_1] 11561 vbroadcasti32x8 m15, [pd_1] 11562 11563 mov r8d, 8 11564.loopH: 11565 mov r9d, 4 11566.loopW: 11567 PSY_PP_INPUT_AVX512_MAIN10 11568 PSY_PP_16x8_AVX512_MAIN10 11569 11570 paddd xm11, xm1 11571 add r0, 16 11572 add r2, 16 11573 dec r9d 11574 jnz .loopW 11575 lea r0, [r0 + r1 * 8 - 128] 11576 lea r2, [r2 + r3 * 8 - 128] 11577 dec r8d 11578 jnz .loopH 11579 movd eax, xm11 11580 RET 11581%endif 11582 11583%if BIT_DEPTH == 8 11584cglobal psyCost_pp_64x64, 4, 10, 27 11585 lea r4, [3 * r1] 11586 lea r7, [3 * r3] 11587 vbroadcasti32x8 m8, [hmul_8p] 11588 pxor m13, m13 11589 vbroadcasti32x8 m14, [pw_1] 11590 vbroadcasti32x8 m15, [pd_1] 11591 movu m26, [psy_pp_shuff3] 11592 11593 mov r8d, 8 11594.loopH: 11595 mov r9d, 4 11596.loopW: 11597 PSY_PP_INPUT_AVX512_MAIN 11598 PSY_PP_16x8_AVX512_MAIN 11599 11600 paddd m13, m16 11601 add r0, 16 11602 add r2, 16 11603 dec r9d 11604 jnz .loopW 11605 lea r0, [r0 + r1 * 8 - 64] 11606 lea r2, [r2 + r3 * 8 - 64] 11607 dec r8d 11608 jnz .loopH 11609 movd eax, xm13 11610 RET 11611%endif 11612%endif 11613 11614;--------------------------------------------------------------------------------------------------------------------- 11615;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) 11616;--------------------------------------------------------------------------------------------------------------------- 11617INIT_XMM sse4 11618cglobal psyCost_ss_4x4, 4, 5, 8 11619 11620 add r1, r1 11621 lea r4, [3 * r1] 11622 movddup m0, [r0] 11623 movddup m1, [r0 + r1] 11624 movddup m2, [r0 + r1 * 2] 11625 movddup m3, [r0 + r4] 11626 11627 pabsw m4, m0 11628 pabsw m5, m1 11629 paddw m5, m4 11630 pabsw m4, m2 11631 paddw m5, m4 11632 pabsw m4, m3 11633 paddw m5, m4 11634 pmaddwd m5, [pw_1] 11635 
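    ; m5 now holds the summed absolute source residuals; the reduce and
    ; shift-by-2 below produce the sad_4x4/4 term, kept in m6 until the
    ; SA8D of the same rows is available to subtract it from.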
psrldq m4, m5, 4 11636 paddd m5, m4 11637 psrld m6, m5, 2 11638 11639 mova m4, [hmul_8w] 11640 pmaddwd m0, m4 11641 pmaddwd m1, m4 11642 pmaddwd m2, m4 11643 pmaddwd m3, m4 11644 11645 psrldq m4, m0, 4 11646 psubd m5, m0, m4 11647 paddd m0, m4 11648 shufps m0, m5, 10001000b 11649 11650 psrldq m4, m1, 4 11651 psubd m5, m1, m4 11652 paddd m1, m4 11653 shufps m1, m5, 10001000b 11654 11655 psrldq m4, m2, 4 11656 psubd m5, m2, m4 11657 paddd m2, m4 11658 shufps m2, m5, 10001000b 11659 11660 psrldq m4, m3, 4 11661 psubd m5, m3, m4 11662 paddd m3, m4 11663 shufps m3, m5, 10001000b 11664 11665 mova m4, m0 11666 paddd m0, m1 11667 psubd m1, m4 11668 mova m4, m2 11669 paddd m2, m3 11670 psubd m3, m4 11671 mova m4, m0 11672 paddd m0, m2 11673 psubd m2, m4 11674 mova m4, m1 11675 paddd m1, m3 11676 psubd m3, m4 11677 11678 pabsd m0, m0 11679 pabsd m2, m2 11680 pabsd m1, m1 11681 pabsd m3, m3 11682 paddd m0, m2 11683 paddd m1, m3 11684 paddd m0, m1 11685 movhlps m1, m0 11686 paddd m0, m1 11687 psrldq m1, m0, 4 11688 paddd m0, m1 11689 psrld m0, 1 11690 psubd m7, m0, m6 11691 11692 add r3, r3 11693 lea r4, [3 * r3] 11694 movddup m0, [r2] 11695 movddup m1, [r2 + r3] 11696 movddup m2, [r2 + r3 * 2] 11697 movddup m3, [r2 + r4] 11698 11699 pabsw m4, m0 11700 pabsw m5, m1 11701 paddw m5, m4 11702 pabsw m4, m2 11703 paddw m5, m4 11704 pabsw m4, m3 11705 paddw m5, m4 11706 pmaddwd m5, [pw_1] 11707 psrldq m4, m5, 4 11708 paddd m5, m4 11709 psrld m6, m5, 2 11710 11711 mova m4, [hmul_8w] 11712 pmaddwd m0, m4 11713 pmaddwd m1, m4 11714 pmaddwd m2, m4 11715 pmaddwd m3, m4 11716 11717 psrldq m4, m0, 4 11718 psubd m5, m0, m4 11719 paddd m0, m4 11720 shufps m0, m5, 10001000b 11721 11722 psrldq m4, m1, 4 11723 psubd m5, m1, m4 11724 paddd m1, m4 11725 shufps m1, m5, 10001000b 11726 11727 psrldq m4, m2, 4 11728 psubd m5, m2, m4 11729 paddd m2, m4 11730 shufps m2, m5, 10001000b 11731 11732 psrldq m4, m3, 4 11733 psubd m5, m3, m4 11734 paddd m3, m4 11735 shufps m3, m5, 10001000b 11736 11737 mova m4, m0 11738 paddd m0, m1 11739 psubd m1, m4 11740 mova m4, m2 11741 paddd m2, m3 11742 psubd m3, m4 11743 mova m4, m0 11744 paddd m0, m2 11745 psubd m2, m4 11746 mova m4, m1 11747 paddd m1, m3 11748 psubd m3, m4 11749 11750 pabsd m0, m0 11751 pabsd m2, m2 11752 pabsd m1, m1 11753 pabsd m3, m3 11754 paddd m0, m2 11755 paddd m1, m3 11756 paddd m0, m1 11757 movhlps m1, m0 11758 paddd m0, m1 11759 psrldq m1, m0, 4 11760 paddd m0, m1 11761 psrld m0, 1 11762 psubd m0, m6 11763 psubd m7, m0 11764 pabsd m0, m7 11765 movd eax, m0 11766 RET 11767 11768%if ARCH_X86_64 11769INIT_XMM sse4 11770cglobal psyCost_ss_8x8, 4, 6, 15 11771 11772 mova m13, [pw_pmpmpmpm] 11773 mova m14, [pw_1] 11774 add r1, r1 11775 add r3, r3 11776 lea r4, [3 * r1] 11777 movu m0, [r0] 11778 movu m1, [r0 + r1] 11779 movu m2, [r0 + r1 * 2] 11780 movu m3, [r0 + r4] 11781 lea r5, [r0 + r1 * 4] 11782 movu m4, [r5] 11783 movu m5, [r5 + r1] 11784 movu m6, [r5 + r1 * 2] 11785 movu m7, [r5 + r4] 11786 11787 pabsw m8, m0 11788 pabsw m9, m1 11789 paddw m8, m9 11790 pabsw m10, m2 11791 pabsw m11, m3 11792 paddw m10, m11 11793 paddw m8, m10 11794 pabsw m9, m4 11795 pabsw m10, m5 11796 paddw m9, m10 11797 pabsw m11, m6 11798 pabsw m12, m7 11799 paddw m11, m12 11800 paddw m9, m11 11801 paddw m8, m9 11802 movhlps m9, m8 11803 pmovzxwd m8, m8 11804 pmovzxwd m9, m9 11805 paddd m8, m9 11806 movhlps m9, m8 11807 paddd m8, m9 11808 psrldq m9, m8, 4 11809 paddd m8, m9 11810 psrld m8, 2 11811 11812 pmaddwd m0, m13 11813 pmaddwd m1, m13 11814 pmaddwd m2, m13 11815 pmaddwd m3, m13 11816 11817 
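    ; pw_pmpmpmpm gives the difference half of the horizontal transform
    ; (the matching sum half is computed further down with pw_1); each
    ; psrldq/psubd/paddd/shufps group below is one more butterfly stage,
    ; and the SUMSUB_BA ladder performs the vertical stages of the SA8D.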
psrldq m9, m0, 4 11818 psubd m10, m0, m9 11819 paddd m0, m9 11820 shufps m0, m10, 10001000b 11821 psrldq m9, m0, 4 11822 psubd m10, m0, m9 11823 paddd m0, m9 11824 shufps m0, m10, 10001000b 11825 11826 psrldq m9, m1, 4 11827 psubd m10, m1, m9 11828 paddd m1, m9 11829 shufps m1, m10, 10001000b 11830 psrldq m9, m1, 4 11831 psubd m10, m1, m9 11832 paddd m1, m9 11833 shufps m1, m10, 10001000b 11834 11835 psrldq m9, m2, 4 11836 psubd m10, m2, m9 11837 paddd m2, m9 11838 shufps m2, m10, 10001000b 11839 psrldq m9, m2, 4 11840 psubd m10, m2, m9 11841 paddd m2, m9 11842 shufps m2, m10, 10001000b 11843 11844 psrldq m9, m3, 4 11845 psubd m10, m3, m9 11846 paddd m3, m9 11847 shufps m3, m10, 10001000b 11848 psrldq m9, m3, 4 11849 psubd m10, m3, m9 11850 paddd m3, m9 11851 shufps m3, m10, 10001000b 11852 11853 SUMSUB_BA d, 0, 1, 9 11854 SUMSUB_BA d, 2, 3, 9 11855 SUMSUB_BA d, 0, 2, 9 11856 SUMSUB_BA d, 1, 3, 9 11857 11858 pmaddwd m4, m13 11859 pmaddwd m5, m13 11860 pmaddwd m6, m13 11861 pmaddwd m7, m13 11862 11863 psrldq m9, m4, 4 11864 psubd m10, m4, m9 11865 paddd m4, m9 11866 shufps m4, m10, 10001000b 11867 psrldq m9, m4, 4 11868 psubd m10, m4, m9 11869 paddd m4, m9 11870 shufps m4, m10, 10001000b 11871 11872 psrldq m9, m5, 4 11873 psubd m10, m5, m9 11874 paddd m5, m9 11875 shufps m5, m10, 10001000b 11876 psrldq m9, m5, 4 11877 psubd m10, m5, m9 11878 paddd m5, m9 11879 shufps m5, m10, 10001000b 11880 11881 psrldq m9, m6, 4 11882 psubd m10, m6, m9 11883 paddd m6, m9 11884 shufps m6, m10, 10001000b 11885 psrldq m9, m6, 4 11886 psubd m10, m6, m9 11887 paddd m6, m9 11888 shufps m6, m10, 10001000b 11889 11890 psrldq m9, m7, 4 11891 psubd m10, m7, m9 11892 paddd m7, m9 11893 shufps m7, m10, 10001000b 11894 psrldq m9, m7, 4 11895 psubd m10, m7, m9 11896 paddd m7, m9 11897 shufps m7, m10, 10001000b 11898 11899 SUMSUB_BA d, 4, 5, 9 11900 SUMSUB_BA d, 6, 7, 9 11901 SUMSUB_BA d, 4, 6, 9 11902 SUMSUB_BA d, 5, 7, 9 11903 11904 SUMSUB_BA d, 0, 4, 9 11905 SUMSUB_BA d, 1, 5, 9 11906 SUMSUB_BA d, 2, 6, 9 11907 SUMSUB_BA d, 3, 7, 9 11908 11909 pabsd m0, m0 11910 pabsd m2, m2 11911 pabsd m1, m1 11912 pabsd m3, m3 11913 pabsd m4, m4 11914 pabsd m5, m5 11915 pabsd m6, m6 11916 pabsd m7, m7 11917 11918 paddd m0, m2 11919 paddd m1, m3 11920 paddd m0, m1 11921 paddd m5, m4 11922 paddd m0, m5 11923 paddd m7, m6 11924 paddd m11, m0, m7 11925 11926 movu m0, [r0] 11927 movu m1, [r0 + r1] 11928 movu m2, [r0 + r1 * 2] 11929 movu m3, [r0 + r4] 11930 11931 pmaddwd m0, m14 11932 pmaddwd m1, m14 11933 pmaddwd m2, m14 11934 pmaddwd m3, m14 11935 11936 psrldq m9, m0, 4 11937 psubd m10, m0, m9 11938 paddd m0, m9 11939 shufps m0, m10, 10001000b 11940 psrldq m9, m0, 4 11941 psubd m10, m0, m9 11942 paddd m0, m9 11943 shufps m0, m10, 10001000b 11944 11945 psrldq m9, m1, 4 11946 psubd m10, m1, m9 11947 paddd m1, m9 11948 shufps m1, m10, 10001000b 11949 psrldq m9, m1, 4 11950 psubd m10, m1, m9 11951 paddd m1, m9 11952 shufps m1, m10, 10001000b 11953 11954 psrldq m9, m2, 4 11955 psubd m10, m2, m9 11956 paddd m2, m9 11957 shufps m2, m10, 10001000b 11958 psrldq m9, m2, 4 11959 psubd m10, m2, m9 11960 paddd m2, m9 11961 shufps m2, m10, 10001000b 11962 11963 psrldq m9, m3, 4 11964 psubd m10, m3, m9 11965 paddd m3, m9 11966 shufps m3, m10, 10001000b 11967 psrldq m9, m3, 4 11968 psubd m10, m3, m9 11969 paddd m3, m9 11970 shufps m3, m10, 10001000b 11971 11972 SUMSUB_BA d, 0, 1, 9 11973 SUMSUB_BA d, 2, 3, 9 11974 SUMSUB_BA d, 0, 2, 9 11975 SUMSUB_BA d, 1, 3, 9 11976 11977 movu m4, [r5] 11978 movu m5, [r5 + r1] 11979 movu m6, [r5 + r1 * 2] 11980 movu 
m7, [r5 + r4] 11981 11982 pmaddwd m4, m14 11983 pmaddwd m5, m14 11984 pmaddwd m6, m14 11985 pmaddwd m7, m14 11986 11987 psrldq m9, m4, 4 11988 psubd m10, m4, m9 11989 paddd m4, m9 11990 shufps m4, m10, 10001000b 11991 psrldq m9, m4, 4 11992 psubd m10, m4, m9 11993 paddd m4, m9 11994 shufps m4, m10, 10001000b 11995 11996 psrldq m9, m5, 4 11997 psubd m10, m5, m9 11998 paddd m5, m9 11999 shufps m5, m10, 10001000b 12000 psrldq m9, m5, 4 12001 psubd m10, m5, m9 12002 paddd m5, m9 12003 shufps m5, m10, 10001000b 12004 12005 psrldq m9, m6, 4 12006 psubd m10, m6, m9 12007 paddd m6, m9 12008 shufps m6, m10, 10001000b 12009 psrldq m9, m6, 4 12010 psubd m10, m6, m9 12011 paddd m6, m9 12012 shufps m6, m10, 10001000b 12013 12014 psrldq m9, m7, 4 12015 psubd m10, m7, m9 12016 paddd m7, m9 12017 shufps m7, m10, 10001000b 12018 psrldq m9, m7, 4 12019 psubd m10, m7, m9 12020 paddd m7, m9 12021 shufps m7, m10, 10001000b 12022 12023 SUMSUB_BA d, 4, 5, 9 12024 SUMSUB_BA d, 6, 7, 9 12025 SUMSUB_BA d, 4, 6, 9 12026 SUMSUB_BA d, 5, 7, 9 12027 12028 SUMSUB_BA d, 0, 4, 9 12029 SUMSUB_BA d, 1, 5, 9 12030 SUMSUB_BA d, 2, 6, 9 12031 SUMSUB_BA d, 3, 7, 9 12032 12033 pabsd m0, m0 12034 pabsd m2, m2 12035 pabsd m1, m1 12036 pabsd m3, m3 12037 pabsd m4, m4 12038 pabsd m5, m5 12039 pabsd m6, m6 12040 pabsd m7, m7 12041 12042 paddd m0, m2 12043 paddd m1, m3 12044 paddd m0, m1 12045 paddd m5, m4 12046 paddd m0, m5 12047 paddd m7, m6 12048 paddd m0, m7 12049 paddd m0, m11 12050 12051 movhlps m1, m0 12052 paddd m0, m1 12053 psrldq m1, m0, 4 12054 paddd m0, m1 12055 paddd m0, [pd_2] 12056 psrld m0, 2 12057 psubd m12, m0, m8 12058 12059 lea r4, [3 * r3] 12060 movu m0, [r2] 12061 movu m1, [r2 + r3] 12062 movu m2, [r2 + r3 * 2] 12063 movu m3, [r2 + r4] 12064 lea r5, [r2 + r3 * 4] 12065 movu m4, [r5] 12066 movu m5, [r5 + r3] 12067 movu m6, [r5 + r3 * 2] 12068 movu m7, [r5 + r4] 12069 12070 pabsw m8, m0 12071 pabsw m9, m1 12072 paddw m8, m9 12073 pabsw m10, m2 12074 pabsw m11, m3 12075 paddw m10, m11 12076 paddw m8, m10 12077 pabsw m9, m4 12078 pabsw m10, m5 12079 paddw m9, m10 12080 pabsw m11, m6 12081 pabsw m10, m7 12082 paddw m11, m10 12083 paddw m9, m11 12084 paddw m8, m9 12085 movhlps m9, m8 12086 pmovzxwd m8, m8 12087 pmovzxwd m9, m9 12088 paddd m8, m9 12089 movhlps m9, m8 12090 paddd m8, m9 12091 psrldq m9, m8, 4 12092 paddd m8, m9 12093 psrld m8, 2 12094 12095 pmaddwd m0, m13 12096 pmaddwd m1, m13 12097 pmaddwd m2, m13 12098 pmaddwd m3, m13 12099 12100 psrldq m9, m0, 4 12101 psubd m10, m0, m9 12102 paddd m0, m9 12103 shufps m0, m10, 10001000b 12104 psrldq m9, m0, 4 12105 psubd m10, m0, m9 12106 paddd m0, m9 12107 shufps m0, m10, 10001000b 12108 12109 psrldq m9, m1, 4 12110 psubd m10, m1, m9 12111 paddd m1, m9 12112 shufps m1, m10, 10001000b 12113 psrldq m9, m1, 4 12114 psubd m10, m1, m9 12115 paddd m1, m9 12116 shufps m1, m10, 10001000b 12117 12118 psrldq m9, m2, 4 12119 psubd m10, m2, m9 12120 paddd m2, m9 12121 shufps m2, m10, 10001000b 12122 psrldq m9, m2, 4 12123 psubd m10, m2, m9 12124 paddd m2, m9 12125 shufps m2, m10, 10001000b 12126 12127 psrldq m9, m3, 4 12128 psubd m10, m3, m9 12129 paddd m3, m9 12130 shufps m3, m10, 10001000b 12131 psrldq m9, m3, 4 12132 psubd m10, m3, m9 12133 paddd m3, m9 12134 shufps m3, m10, 10001000b 12135 12136 SUMSUB_BA d, 0, 1, 9 12137 SUMSUB_BA d, 2, 3, 9 12138 SUMSUB_BA d, 0, 2, 9 12139 SUMSUB_BA d, 1, 3, 9 12140 12141 pmaddwd m4, m13 12142 pmaddwd m5, m13 12143 pmaddwd m6, m13 12144 pmaddwd m7, m13 12145 12146 psrldq m9, m4, 4 12147 psubd m10, m4, m9 12148 paddd m4, m9 12149 shufps m4, 
m10, 10001000b 12150 psrldq m9, m4, 4 12151 psubd m10, m4, m9 12152 paddd m4, m9 12153 shufps m4, m10, 10001000b 12154 12155 psrldq m9, m5, 4 12156 psubd m10, m5, m9 12157 paddd m5, m9 12158 shufps m5, m10, 10001000b 12159 psrldq m9, m5, 4 12160 psubd m10, m5, m9 12161 paddd m5, m9 12162 shufps m5, m10, 10001000b 12163 12164 psrldq m9, m6, 4 12165 psubd m10, m6, m9 12166 paddd m6, m9 12167 shufps m6, m10, 10001000b 12168 psrldq m9, m6, 4 12169 psubd m10, m6, m9 12170 paddd m6, m9 12171 shufps m6, m10, 10001000b 12172 12173 psrldq m9, m7, 4 12174 psubd m10, m7, m9 12175 paddd m7, m9 12176 shufps m7, m10, 10001000b 12177 psrldq m9, m7, 4 12178 psubd m10, m7, m9 12179 paddd m7, m9 12180 shufps m7, m10, 10001000b 12181 12182 SUMSUB_BA d, 4, 5, 9 12183 SUMSUB_BA d, 6, 7, 9 12184 SUMSUB_BA d, 4, 6, 9 12185 SUMSUB_BA d, 5, 7, 9 12186 12187 SUMSUB_BA d, 0, 4, 9 12188 SUMSUB_BA d, 1, 5, 9 12189 SUMSUB_BA d, 2, 6, 9 12190 SUMSUB_BA d, 3, 7, 9 12191 12192 pabsd m0, m0 12193 pabsd m2, m2 12194 pabsd m1, m1 12195 pabsd m3, m3 12196 pabsd m4, m4 12197 pabsd m5, m5 12198 pabsd m6, m6 12199 pabsd m7, m7 12200 12201 paddd m0, m2 12202 paddd m1, m3 12203 paddd m0, m1 12204 paddd m5, m4 12205 paddd m0, m5 12206 paddd m7, m6 12207 paddd m11, m0, m7 12208 12209 movu m0, [r2] 12210 movu m1, [r2 + r3] 12211 movu m2, [r2 + r3 * 2] 12212 movu m3, [r2 + r4] 12213 12214 pmaddwd m0, m14 12215 pmaddwd m1, m14 12216 pmaddwd m2, m14 12217 pmaddwd m3, m14 12218 12219 psrldq m9, m0, 4 12220 psubd m10, m0, m9 12221 paddd m0, m9 12222 shufps m0, m10, 10001000b 12223 psrldq m9, m0, 4 12224 psubd m10, m0, m9 12225 paddd m0, m9 12226 shufps m0, m10, 10001000b 12227 12228 psrldq m9, m1, 4 12229 psubd m10, m1, m9 12230 paddd m1, m9 12231 shufps m1, m10, 10001000b 12232 psrldq m9, m1, 4 12233 psubd m10, m1, m9 12234 paddd m1, m9 12235 shufps m1, m10, 10001000b 12236 12237 psrldq m9, m2, 4 12238 psubd m10, m2, m9 12239 paddd m2, m9 12240 shufps m2, m10, 10001000b 12241 psrldq m9, m2, 4 12242 psubd m10, m2, m9 12243 paddd m2, m9 12244 shufps m2, m10, 10001000b 12245 12246 psrldq m9, m3, 4 12247 psubd m10, m3, m9 12248 paddd m3, m9 12249 shufps m3, m10, 10001000b 12250 psrldq m9, m3, 4 12251 psubd m10, m3, m9 12252 paddd m3, m9 12253 shufps m3, m10, 10001000b 12254 12255 SUMSUB_BA d, 0, 1, 9 12256 SUMSUB_BA d, 2, 3, 9 12257 SUMSUB_BA d, 0, 2, 9 12258 SUMSUB_BA d, 1, 3, 9 12259 12260 movu m4, [r5] 12261 movu m5, [r5 + r3] 12262 movu m6, [r5 + r3 * 2] 12263 movu m7, [r5 + r4] 12264 12265 pmaddwd m4, m14 12266 pmaddwd m5, m14 12267 pmaddwd m6, m14 12268 pmaddwd m7, m14 12269 12270 psrldq m9, m4, 4 12271 psubd m10, m4, m9 12272 paddd m4, m9 12273 shufps m4, m10, 10001000b 12274 psrldq m9, m4, 4 12275 psubd m10, m4, m9 12276 paddd m4, m9 12277 shufps m4, m10, 10001000b 12278 12279 psrldq m9, m5, 4 12280 psubd m10, m5, m9 12281 paddd m5, m9 12282 shufps m5, m10, 10001000b 12283 psrldq m9, m5, 4 12284 psubd m10, m5, m9 12285 paddd m5, m9 12286 shufps m5, m10, 10001000b 12287 12288 psrldq m9, m6, 4 12289 psubd m10, m6, m9 12290 paddd m6, m9 12291 shufps m6, m10, 10001000b 12292 psrldq m9, m6, 4 12293 psubd m10, m6, m9 12294 paddd m6, m9 12295 shufps m6, m10, 10001000b 12296 12297 psrldq m9, m7, 4 12298 psubd m10, m7, m9 12299 paddd m7, m9 12300 shufps m7, m10, 10001000b 12301 psrldq m9, m7, 4 12302 psubd m10, m7, m9 12303 paddd m7, m9 12304 shufps m7, m10, 10001000b 12305 12306 SUMSUB_BA d, 4, 5, 9 12307 SUMSUB_BA d, 6, 7, 9 12308 SUMSUB_BA d, 4, 6, 9 12309 SUMSUB_BA d, 5, 7, 9 12310 12311 SUMSUB_BA d, 0, 4, 9 12312 SUMSUB_BA d, 1, 5, 9 
12313 SUMSUB_BA d, 2, 6, 9 12314 SUMSUB_BA d, 3, 7, 9 12315 12316 pabsd m0, m0 12317 pabsd m2, m2 12318 pabsd m1, m1 12319 pabsd m3, m3 12320 pabsd m4, m4 12321 pabsd m5, m5 12322 pabsd m6, m6 12323 pabsd m7, m7 12324 12325 paddd m0, m2 12326 paddd m1, m3 12327 paddd m0, m1 12328 paddd m5, m4 12329 paddd m0, m5 12330 paddd m7, m6 12331 paddd m0, m7 12332 paddd m0, m11 12333 12334 movhlps m1, m0 12335 paddd m0, m1 12336 psrldq m1, m0, 4 12337 paddd m0, m1 12338 paddd m0, [pd_2] 12339 psrld m0, 2 12340 psubd m0, m8 12341 12342 psubd m12, m0 12343 pabsd m0, m12 12344 movd eax, m0 12345 RET 12346%endif 12347 12348%macro psy_cost_ss 0 12349 movu m0, [r0] 12350 movu m1, [r0 + r1] 12351 movu m2, [r0 + r1 * 2] 12352 movu m3, [r0 + r4] 12353 lea r5, [r0 + r1 * 4] 12354 movu m4, [r5] 12355 movu m5, [r5 + r1] 12356 movu m6, [r5 + r1 * 2] 12357 movu m7, [r5 + r4] 12358 12359 pabsw m8, m0 12360 pabsw m9, m1 12361 paddw m8, m9 12362 pabsw m10, m2 12363 pabsw m11, m3 12364 paddw m10, m11 12365 paddw m8, m10 12366 pabsw m9, m4 12367 pabsw m10, m5 12368 paddw m9, m10 12369 pabsw m11, m6 12370 pabsw m12, m7 12371 paddw m11, m12 12372 paddw m9, m11 12373 paddw m8, m9 12374 movhlps m9, m8 12375 pmovzxwd m8, m8 12376 pmovzxwd m9, m9 12377 paddd m8, m9 12378 movhlps m9, m8 12379 paddd m8, m9 12380 psrldq m9, m8, 4 12381 paddd m8, m9 12382 psrld m8, 2 12383 12384 pmaddwd m0, m13 12385 pmaddwd m1, m13 12386 pmaddwd m2, m13 12387 pmaddwd m3, m13 12388 12389 psrldq m9, m0, 4 12390 psubd m10, m0, m9 12391 paddd m0, m9 12392 shufps m0, m10, 10001000b 12393 psrldq m9, m0, 4 12394 psubd m10, m0, m9 12395 paddd m0, m9 12396 shufps m0, m10, 10001000b 12397 12398 psrldq m9, m1, 4 12399 psubd m10, m1, m9 12400 paddd m1, m9 12401 shufps m1, m10, 10001000b 12402 psrldq m9, m1, 4 12403 psubd m10, m1, m9 12404 paddd m1, m9 12405 shufps m1, m10, 10001000b 12406 12407 psrldq m9, m2, 4 12408 psubd m10, m2, m9 12409 paddd m2, m9 12410 shufps m2, m10, 10001000b 12411 psrldq m9, m2, 4 12412 psubd m10, m2, m9 12413 paddd m2, m9 12414 shufps m2, m10, 10001000b 12415 12416 psrldq m9, m3, 4 12417 psubd m10, m3, m9 12418 paddd m3, m9 12419 shufps m3, m10, 10001000b 12420 psrldq m9, m3, 4 12421 psubd m10, m3, m9 12422 paddd m3, m9 12423 shufps m3, m10, 10001000b 12424 12425 SUMSUB_BA d, 0, 1, 9 12426 SUMSUB_BA d, 2, 3, 9 12427 SUMSUB_BA d, 0, 2, 9 12428 SUMSUB_BA d, 1, 3, 9 12429 12430 pmaddwd m4, m13 12431 pmaddwd m5, m13 12432 pmaddwd m6, m13 12433 pmaddwd m7, m13 12434 12435 psrldq m9, m4, 4 12436 psubd m10, m4, m9 12437 paddd m4, m9 12438 shufps m4, m10, 10001000b 12439 psrldq m9, m4, 4 12440 psubd m10, m4, m9 12441 paddd m4, m9 12442 shufps m4, m10, 10001000b 12443 12444 psrldq m9, m5, 4 12445 psubd m10, m5, m9 12446 paddd m5, m9 12447 shufps m5, m10, 10001000b 12448 psrldq m9, m5, 4 12449 psubd m10, m5, m9 12450 paddd m5, m9 12451 shufps m5, m10, 10001000b 12452 12453 psrldq m9, m6, 4 12454 psubd m10, m6, m9 12455 paddd m6, m9 12456 shufps m6, m10, 10001000b 12457 psrldq m9, m6, 4 12458 psubd m10, m6, m9 12459 paddd m6, m9 12460 shufps m6, m10, 10001000b 12461 12462 psrldq m9, m7, 4 12463 psubd m10, m7, m9 12464 paddd m7, m9 12465 shufps m7, m10, 10001000b 12466 psrldq m9, m7, 4 12467 psubd m10, m7, m9 12468 paddd m7, m9 12469 shufps m7, m10, 10001000b 12470 12471 SUMSUB_BA d, 4, 5, 9 12472 SUMSUB_BA d, 6, 7, 9 12473 SUMSUB_BA d, 4, 6, 9 12474 SUMSUB_BA d, 5, 7, 9 12475 12476 SUMSUB_BA d, 0, 4, 9 12477 SUMSUB_BA d, 1, 5, 9 12478 SUMSUB_BA d, 2, 6, 9 12479 SUMSUB_BA d, 3, 7, 9 12480 12481 pabsd m0, m0 12482 pabsd m2, m2 12483 pabsd 
m1, m1 12484 pabsd m3, m3 12485 pabsd m4, m4 12486 pabsd m5, m5 12487 pabsd m6, m6 12488 pabsd m7, m7 12489 12490 paddd m0, m2 12491 paddd m1, m3 12492 paddd m0, m1 12493 paddd m5, m4 12494 paddd m0, m5 12495 paddd m7, m6 12496 paddd m11, m0, m7 12497 12498 movu m0, [r0] 12499 movu m1, [r0 + r1] 12500 movu m2, [r0 + r1 * 2] 12501 movu m3, [r0 + r4] 12502 12503 pmaddwd m0, m14 12504 pmaddwd m1, m14 12505 pmaddwd m2, m14 12506 pmaddwd m3, m14 12507 12508 psrldq m9, m0, 4 12509 psubd m10, m0, m9 12510 paddd m0, m9 12511 shufps m0, m10, 10001000b 12512 psrldq m9, m0, 4 12513 psubd m10, m0, m9 12514 paddd m0, m9 12515 shufps m0, m10, 10001000b 12516 12517 psrldq m9, m1, 4 12518 psubd m10, m1, m9 12519 paddd m1, m9 12520 shufps m1, m10, 10001000b 12521 psrldq m9, m1, 4 12522 psubd m10, m1, m9 12523 paddd m1, m9 12524 shufps m1, m10, 10001000b 12525 12526 psrldq m9, m2, 4 12527 psubd m10, m2, m9 12528 paddd m2, m9 12529 shufps m2, m10, 10001000b 12530 psrldq m9, m2, 4 12531 psubd m10, m2, m9 12532 paddd m2, m9 12533 shufps m2, m10, 10001000b 12534 12535 psrldq m9, m3, 4 12536 psubd m10, m3, m9 12537 paddd m3, m9 12538 shufps m3, m10, 10001000b 12539 psrldq m9, m3, 4 12540 psubd m10, m3, m9 12541 paddd m3, m9 12542 shufps m3, m10, 10001000b 12543 12544 SUMSUB_BA d, 0, 1, 9 12545 SUMSUB_BA d, 2, 3, 9 12546 SUMSUB_BA d, 0, 2, 9 12547 SUMSUB_BA d, 1, 3, 9 12548 12549 movu m4, [r5] 12550 movu m5, [r5 + r1] 12551 movu m6, [r5 + r1 * 2] 12552 movu m7, [r5 + r4] 12553 12554 pmaddwd m4, m14 12555 pmaddwd m5, m14 12556 pmaddwd m6, m14 12557 pmaddwd m7, m14 12558 12559 psrldq m9, m4, 4 12560 psubd m10, m4, m9 12561 paddd m4, m9 12562 shufps m4, m10, 10001000b 12563 psrldq m9, m4, 4 12564 psubd m10, m4, m9 12565 paddd m4, m9 12566 shufps m4, m10, 10001000b 12567 12568 psrldq m9, m5, 4 12569 psubd m10, m5, m9 12570 paddd m5, m9 12571 shufps m5, m10, 10001000b 12572 psrldq m9, m5, 4 12573 psubd m10, m5, m9 12574 paddd m5, m9 12575 shufps m5, m10, 10001000b 12576 12577 psrldq m9, m6, 4 12578 psubd m10, m6, m9 12579 paddd m6, m9 12580 shufps m6, m10, 10001000b 12581 psrldq m9, m6, 4 12582 psubd m10, m6, m9 12583 paddd m6, m9 12584 shufps m6, m10, 10001000b 12585 12586 psrldq m9, m7, 4 12587 psubd m10, m7, m9 12588 paddd m7, m9 12589 shufps m7, m10, 10001000b 12590 psrldq m9, m7, 4 12591 psubd m10, m7, m9 12592 paddd m7, m9 12593 shufps m7, m10, 10001000b 12594 12595 SUMSUB_BA d, 4, 5, 9 12596 SUMSUB_BA d, 6, 7, 9 12597 SUMSUB_BA d, 4, 6, 9 12598 SUMSUB_BA d, 5, 7, 9 12599 12600 SUMSUB_BA d, 0, 4, 9 12601 SUMSUB_BA d, 1, 5, 9 12602 SUMSUB_BA d, 2, 6, 9 12603 SUMSUB_BA d, 3, 7, 9 12604 12605 pabsd m0, m0 12606 pabsd m2, m2 12607 pabsd m1, m1 12608 pabsd m3, m3 12609 pabsd m4, m4 12610 pabsd m5, m5 12611 pabsd m6, m6 12612 pabsd m7, m7 12613 12614 paddd m0, m2 12615 paddd m1, m3 12616 paddd m0, m1 12617 paddd m5, m4 12618 paddd m0, m5 12619 paddd m7, m6 12620 paddd m0, m7 12621 paddd m0, m11 12622 12623 movhlps m1, m0 12624 paddd m0, m1 12625 psrldq m1, m0, 4 12626 paddd m0, m1 12627 paddd m0, [pd_2] 12628 psrld m0, 2 12629 psubd m12, m0, m8 12630 12631 movu m0, [r2] 12632 movu m1, [r2 + r3] 12633 movu m2, [r2 + r3 * 2] 12634 movu m3, [r2 + r6] 12635 lea r5, [r2 + r3 * 4] 12636 movu m4, [r5] 12637 movu m5, [r5 + r3] 12638 movu m6, [r5 + r3 * 2] 12639 movu m7, [r5 + r6] 12640 12641 pabsw m8, m0 12642 pabsw m9, m1 12643 paddw m8, m9 12644 pabsw m10, m2 12645 pabsw m11, m3 12646 paddw m10, m11 12647 paddw m8, m10 12648 pabsw m9, m4 12649 pabsw m10, m5 12650 paddw m9, m10 12651 pabsw m11, m6 12652 pabsw m10, m7 12653 
paddw m11, m10 12654 paddw m9, m11 12655 paddw m8, m9 12656 movhlps m9, m8 12657 pmovzxwd m8, m8 12658 pmovzxwd m9, m9 12659 paddd m8, m9 12660 movhlps m9, m8 12661 paddd m8, m9 12662 psrldq m9, m8, 4 12663 paddd m8, m9 12664 psrld m8, 2 12665 12666 pmaddwd m0, m13 12667 pmaddwd m1, m13 12668 pmaddwd m2, m13 12669 pmaddwd m3, m13 12670 12671 psrldq m9, m0, 4 12672 psubd m10, m0, m9 12673 paddd m0, m9 12674 shufps m0, m10, 10001000b 12675 psrldq m9, m0, 4 12676 psubd m10, m0, m9 12677 paddd m0, m9 12678 shufps m0, m10, 10001000b 12679 12680 psrldq m9, m1, 4 12681 psubd m10, m1, m9 12682 paddd m1, m9 12683 shufps m1, m10, 10001000b 12684 psrldq m9, m1, 4 12685 psubd m10, m1, m9 12686 paddd m1, m9 12687 shufps m1, m10, 10001000b 12688 12689 psrldq m9, m2, 4 12690 psubd m10, m2, m9 12691 paddd m2, m9 12692 shufps m2, m10, 10001000b 12693 psrldq m9, m2, 4 12694 psubd m10, m2, m9 12695 paddd m2, m9 12696 shufps m2, m10, 10001000b 12697 12698 psrldq m9, m3, 4 12699 psubd m10, m3, m9 12700 paddd m3, m9 12701 shufps m3, m10, 10001000b 12702 psrldq m9, m3, 4 12703 psubd m10, m3, m9 12704 paddd m3, m9 12705 shufps m3, m10, 10001000b 12706 12707 SUMSUB_BA d, 0, 1, 9 12708 SUMSUB_BA d, 2, 3, 9 12709 SUMSUB_BA d, 0, 2, 9 12710 SUMSUB_BA d, 1, 3, 9 12711 12712 pmaddwd m4, m13 12713 pmaddwd m5, m13 12714 pmaddwd m6, m13 12715 pmaddwd m7, m13 12716 12717 psrldq m9, m4, 4 12718 psubd m10, m4, m9 12719 paddd m4, m9 12720 shufps m4, m10, 10001000b 12721 psrldq m9, m4, 4 12722 psubd m10, m4, m9 12723 paddd m4, m9 12724 shufps m4, m10, 10001000b 12725 12726 psrldq m9, m5, 4 12727 psubd m10, m5, m9 12728 paddd m5, m9 12729 shufps m5, m10, 10001000b 12730 psrldq m9, m5, 4 12731 psubd m10, m5, m9 12732 paddd m5, m9 12733 shufps m5, m10, 10001000b 12734 12735 psrldq m9, m6, 4 12736 psubd m10, m6, m9 12737 paddd m6, m9 12738 shufps m6, m10, 10001000b 12739 psrldq m9, m6, 4 12740 psubd m10, m6, m9 12741 paddd m6, m9 12742 shufps m6, m10, 10001000b 12743 12744 psrldq m9, m7, 4 12745 psubd m10, m7, m9 12746 paddd m7, m9 12747 shufps m7, m10, 10001000b 12748 psrldq m9, m7, 4 12749 psubd m10, m7, m9 12750 paddd m7, m9 12751 shufps m7, m10, 10001000b 12752 12753 SUMSUB_BA d, 4, 5, 9 12754 SUMSUB_BA d, 6, 7, 9 12755 SUMSUB_BA d, 4, 6, 9 12756 SUMSUB_BA d, 5, 7, 9 12757 12758 SUMSUB_BA d, 0, 4, 9 12759 SUMSUB_BA d, 1, 5, 9 12760 SUMSUB_BA d, 2, 6, 9 12761 SUMSUB_BA d, 3, 7, 9 12762 12763 pabsd m0, m0 12764 pabsd m2, m2 12765 pabsd m1, m1 12766 pabsd m3, m3 12767 pabsd m4, m4 12768 pabsd m5, m5 12769 pabsd m6, m6 12770 pabsd m7, m7 12771 12772 paddd m0, m2 12773 paddd m1, m3 12774 paddd m0, m1 12775 paddd m5, m4 12776 paddd m0, m5 12777 paddd m7, m6 12778 paddd m11, m0, m7 12779 12780 movu m0, [r2] 12781 movu m1, [r2 + r3] 12782 movu m2, [r2 + r3 * 2] 12783 movu m3, [r2 + r6] 12784 12785 pmaddwd m0, m14 12786 pmaddwd m1, m14 12787 pmaddwd m2, m14 12788 pmaddwd m3, m14 12789 12790 psrldq m9, m0, 4 12791 psubd m10, m0, m9 12792 paddd m0, m9 12793 shufps m0, m10, 10001000b 12794 psrldq m9, m0, 4 12795 psubd m10, m0, m9 12796 paddd m0, m9 12797 shufps m0, m10, 10001000b 12798 12799 psrldq m9, m1, 4 12800 psubd m10, m1, m9 12801 paddd m1, m9 12802 shufps m1, m10, 10001000b 12803 psrldq m9, m1, 4 12804 psubd m10, m1, m9 12805 paddd m1, m9 12806 shufps m1, m10, 10001000b 12807 12808 psrldq m9, m2, 4 12809 psubd m10, m2, m9 12810 paddd m2, m9 12811 shufps m2, m10, 10001000b 12812 psrldq m9, m2, 4 12813 psubd m10, m2, m9 12814 paddd m2, m9 12815 shufps m2, m10, 10001000b 12816 12817 psrldq m9, m3, 4 12818 psubd m10, m3, m9 12819 
paddd m3, m9 12820 shufps m3, m10, 10001000b 12821 psrldq m9, m3, 4 12822 psubd m10, m3, m9 12823 paddd m3, m9 12824 shufps m3, m10, 10001000b 12825 12826 SUMSUB_BA d, 0, 1, 9 12827 SUMSUB_BA d, 2, 3, 9 12828 SUMSUB_BA d, 0, 2, 9 12829 SUMSUB_BA d, 1, 3, 9 12830 12831 movu m4, [r5] 12832 movu m5, [r5 + r3] 12833 movu m6, [r5 + r3 * 2] 12834 movu m7, [r5 + r6] 12835 12836 pmaddwd m4, m14 12837 pmaddwd m5, m14 12838 pmaddwd m6, m14 12839 pmaddwd m7, m14 12840 12841 psrldq m9, m4, 4 12842 psubd m10, m4, m9 12843 paddd m4, m9 12844 shufps m4, m10, 10001000b 12845 psrldq m9, m4, 4 12846 psubd m10, m4, m9 12847 paddd m4, m9 12848 shufps m4, m10, 10001000b 12849 12850 psrldq m9, m5, 4 12851 psubd m10, m5, m9 12852 paddd m5, m9 12853 shufps m5, m10, 10001000b 12854 psrldq m9, m5, 4 12855 psubd m10, m5, m9 12856 paddd m5, m9 12857 shufps m5, m10, 10001000b 12858 12859 psrldq m9, m6, 4 12860 psubd m10, m6, m9 12861 paddd m6, m9 12862 shufps m6, m10, 10001000b 12863 psrldq m9, m6, 4 12864 psubd m10, m6, m9 12865 paddd m6, m9 12866 shufps m6, m10, 10001000b 12867 12868 psrldq m9, m7, 4 12869 psubd m10, m7, m9 12870 paddd m7, m9 12871 shufps m7, m10, 10001000b 12872 psrldq m9, m7, 4 12873 psubd m10, m7, m9 12874 paddd m7, m9 12875 shufps m7, m10, 10001000b 12876 12877 SUMSUB_BA d, 4, 5, 9 12878 SUMSUB_BA d, 6, 7, 9 12879 SUMSUB_BA d, 4, 6, 9 12880 SUMSUB_BA d, 5, 7, 9 12881 12882 SUMSUB_BA d, 0, 4, 9 12883 SUMSUB_BA d, 1, 5, 9 12884 SUMSUB_BA d, 2, 6, 9 12885 SUMSUB_BA d, 3, 7, 9 12886 12887 pabsd m0, m0 12888 pabsd m2, m2 12889 pabsd m1, m1 12890 pabsd m3, m3 12891 pabsd m4, m4 12892 pabsd m5, m5 12893 pabsd m6, m6 12894 pabsd m7, m7 12895 12896 paddd m0, m2 12897 paddd m1, m3 12898 paddd m0, m1 12899 paddd m5, m4 12900 paddd m0, m5 12901 paddd m7, m6 12902 paddd m0, m7 12903 paddd m0, m11 12904 12905 movhlps m1, m0 12906 paddd m0, m1 12907 psrldq m1, m0, 4 12908 paddd m0, m1 12909 paddd m0, [pd_2] 12910 psrld m0, 2 12911 psubd m0, m8 12912 12913 psubd m12, m0 12914 pabsd m0, m12 12915 paddd m15, m0 12916%endmacro 12917 12918%if ARCH_X86_64 12919INIT_XMM sse4 12920cglobal psyCost_ss_16x16, 4, 9, 16 12921 12922 mova m13, [pw_pmpmpmpm] 12923 mova m14, [pw_1] 12924 add r1, r1 12925 add r3, r3 12926 lea r4, [3 * r1] 12927 lea r6, [3 * r3] 12928 pxor m15, m15 12929 mov r7d, 2 12930.loopH: 12931 mov r8d, 2 12932.loopW: 12933 psy_cost_ss 12934 add r0, 16 12935 add r2, 16 12936 dec r8d 12937 jnz .loopW 12938 lea r0, [r0 + r1 * 8 - 32] 12939 lea r2, [r2 + r3 * 8 - 32] 12940 dec r7d 12941 jnz .loopH 12942 movd eax, m15 12943 RET 12944%endif 12945 12946%if ARCH_X86_64 12947INIT_XMM sse4 12948cglobal psyCost_ss_32x32, 4, 9, 16 12949 12950 mova m13, [pw_pmpmpmpm] 12951 mova m14, [pw_1] 12952 add r1, r1 12953 add r3, r3 12954 lea r4, [3 * r1] 12955 lea r6, [3 * r3] 12956 pxor m15, m15 12957 mov r7d, 4 12958.loopH: 12959 mov r8d, 4 12960.loopW: 12961 psy_cost_ss 12962 add r0, 16 12963 add r2, 16 12964 dec r8d 12965 jnz .loopW 12966 lea r0, [r0 + r1 * 8 - 64] 12967 lea r2, [r2 + r3 * 8 - 64] 12968 dec r7d 12969 jnz .loopH 12970 movd eax, m15 12971 RET 12972%endif 12973 12974%if ARCH_X86_64 12975INIT_XMM sse4 12976cglobal psyCost_ss_64x64, 4, 9, 16 12977 12978 mova m13, [pw_pmpmpmpm] 12979 mova m14, [pw_1] 12980 add r1, r1 12981 add r3, r3 12982 lea r4, [3 * r1] 12983 lea r6, [3 * r3] 12984 pxor m15, m15 12985 mov r7d, 8 12986.loopH: 12987 mov r8d, 8 12988.loopW: 12989 psy_cost_ss 12990 add r0, 16 12991 add r2, 16 12992 dec r8d 12993 jnz .loopW 12994 lea r0, [r0 + r1 * 8 - 128] 12995 lea r2, [r2 + r3 * 8 - 128] 12996 
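    ; the two lea above step to the next stripe of 8x8 blocks: down eight
    ; rows and back 128 bytes (64 int16 coefficients) to the left edge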
dec r7d 12997 jnz .loopH 12998 movd eax, m15 12999 RET 13000%endif 13001 13002INIT_YMM avx2 13003cglobal psyCost_ss_4x4, 4, 5, 8 13004 add r1, r1 13005 add r3, r3 13006 lea r4, [3 * r1] 13007 movddup m0, [r0] 13008 movddup m1, [r0 + r1] 13009 movddup m2, [r0 + r1 * 2] 13010 movddup m3, [r0 + r4] 13011 13012 lea r4, [3 * r3] 13013 movddup m4, [r2] 13014 movddup m5, [r2 + r3] 13015 movddup m6, [r2 + r3 * 2] 13016 movddup m7, [r2 + r4] 13017 13018 vinserti128 m0, m0, xm4, 1 13019 vinserti128 m1, m1, xm5, 1 13020 vinserti128 m2, m2, xm6, 1 13021 vinserti128 m3, m3, xm7, 1 13022 13023 pabsw m4, m0 13024 pabsw m5, m1 13025 paddw m5, m4 13026 pabsw m4, m2 13027 paddw m5, m4 13028 pabsw m4, m3 13029 paddw m5, m4 13030 pmaddwd m5, [pw_1] 13031 psrldq m4, m5, 4 13032 paddd m5, m4 13033 psrld m6, m5, 2 13034 13035 mova m4, [hmul_8w] 13036 pmaddwd m0, m4 13037 pmaddwd m1, m4 13038 pmaddwd m2, m4 13039 pmaddwd m3, m4 13040 13041 psrldq m4, m0, 4 13042 psubd m5, m0, m4 13043 paddd m0, m4 13044 shufps m0, m0, m5, 10001000b 13045 13046 psrldq m4, m1, 4 13047 psubd m5, m1, m4 13048 paddd m1, m4 13049 shufps m1, m1, m5, 10001000b 13050 13051 psrldq m4, m2, 4 13052 psubd m5, m2, m4 13053 paddd m2, m4 13054 shufps m2, m2, m5, 10001000b 13055 13056 psrldq m4, m3, 4 13057 psubd m5, m3, m4 13058 paddd m3, m4 13059 shufps m3, m3, m5, 10001000b 13060 13061 mova m4, m0 13062 paddd m0, m1 13063 psubd m1, m4 13064 mova m4, m2 13065 paddd m2, m3 13066 psubd m3, m4 13067 mova m4, m0 13068 paddd m0, m2 13069 psubd m2, m4 13070 mova m4, m1 13071 paddd m1, m3 13072 psubd m3, m4 13073 13074 pabsd m0, m0 13075 pabsd m2, m2 13076 pabsd m1, m1 13077 pabsd m3, m3 13078 paddd m0, m2 13079 paddd m1, m3 13080 paddd m0, m1 13081 psrldq m1, m0, 8 13082 paddd m0, m1 13083 psrldq m1, m0, 4 13084 paddd m0, m1 13085 psrld m0, 1 13086 psubd m0, m6 13087 vextracti128 xm1, m0, 1 13088 psubd m0, m1 13089 pabsd m0, m0 13090 movd eax, xm0 13091 RET 13092 13093%macro PSY_SS_8x8 0 13094 lea r4, [3 * r1] 13095 lea r6, [r0 + r1 * 4] 13096 movu xm0, [r0] 13097 movu xm1, [r0 + r1] 13098 movu xm2, [r0 + r1 * 2] 13099 movu xm3, [r0 + r4] 13100 movu xm4, [r6] 13101 movu xm5, [r6 + r1] 13102 movu xm6, [r6 + r1 * 2] 13103 movu xm7, [r6 + r4] 13104 13105 lea r4, [3 * r3] 13106 lea r6, [r2 + r3 * 4] 13107 movu xm8, [r2] 13108 movu xm9, [r2 + r3] 13109 movu xm10, [r2 + r3 * 2] 13110 movu xm11, [r2 + r4] 13111 vinserti128 m0, m0, xm8, 1 13112 vinserti128 m1, m1, xm9, 1 13113 vinserti128 m2, m2, xm10, 1 13114 vinserti128 m3, m3, xm11, 1 13115 movu xm8, [r6] 13116 movu xm9, [r6 + r3] 13117 movu xm10, [r6 + r3 * 2] 13118 movu xm11, [r6 + r4] 13119 vinserti128 m4, m4, xm8, 1 13120 vinserti128 m5, m5, xm9, 1 13121 vinserti128 m6, m6, xm10, 1 13122 vinserti128 m7, m7, xm11, 1 13123 13124 ;; store on stack to use later 13125 mova [rsp + 0 * mmsize], m0 13126 mova [rsp + 1 * mmsize], m1 13127 mova [rsp + 2 * mmsize], m2 13128 mova [rsp + 3 * mmsize], m3 13129 mova [rsp + 4 * mmsize], m4 13130 mova [rsp + 5 * mmsize], m5 13131 mova [rsp + 6 * mmsize], m6 13132 mova [rsp + 7 * mmsize], m7 13133 13134 pabsw m8, m0 13135 pabsw m9, m1 13136 paddw m8, m9 13137 pabsw m10, m2 13138 pabsw m11, m3 13139 paddw m10, m11 13140 paddw m8, m10 13141 pabsw m9, m4 13142 pabsw m10, m5 13143 paddw m9, m10 13144 pabsw m11, m6 13145 pabsw m10, m7 13146 paddw m11, m10 13147 paddw m9, m11 13148 paddw m8, m9 13149 psrldq m9, m8, 8 13150 13151 vextracti128 xm10, m8, 1 13152 vextracti128 xm11, m9, 1 13153 13154 vpmovzxwd m8, xm8 13155 vpmovzxwd m9, xm9 13156 vpmovzxwd m10, xm10 13157 
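    ; the word SAD sums are zero-extended to dwords (above and below) before the
    ; lanes are recombined and reduced to the sad_4x4 term further down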
vpmovzxwd m11, xm11 13158 13159 vinserti128 m8, m8, xm10, 1 13160 vinserti128 m9, m9, xm11, 1 13161 13162 paddd m8, m9 13163 psrldq m9, m8, 8 13164 paddd m8, m9 13165 psrldq m9, m8, 4 13166 paddd m8, m9 13167 psrld m8, 2 ; sad_4x4 13168 13169 pmaddwd m0, m13 13170 pmaddwd m1, m13 13171 pmaddwd m2, m13 13172 pmaddwd m3, m13 13173 13174 psrldq m9, m0, 4 13175 psubd m10, m0, m9 13176 paddd m0, m9 13177 vshufps m0, m0, m10, 10001000b 13178 psrldq m9, m0, 4 13179 psubd m10, m0, m9 13180 paddd m0, m9 13181 vshufps m0, m0, m10, 10001000b 13182 13183 psrldq m9, m1, 4 13184 psubd m10, m1, m9 13185 paddd m1, m9 13186 vshufps m1, m1, m10, 10001000b 13187 psrldq m9, m1, 4 13188 psubd m10, m1, m9 13189 paddd m1, m9 13190 vshufps m1, m1, m10, 10001000b 13191 13192 psrldq m9, m2, 4 13193 psubd m10, m2, m9 13194 paddd m2, m9 13195 vshufps m2, m2, m10, 10001000b 13196 psrldq m9, m2, 4 13197 psubd m10, m2, m9 13198 paddd m2, m9 13199 vshufps m2, m2, m10, 10001000b 13200 13201 psrldq m9, m3, 4 13202 psubd m10, m3, m9 13203 paddd m3, m9 13204 vshufps m3, m3, m10, 10001000b 13205 psrldq m9, m3, 4 13206 psubd m10, m3, m9 13207 paddd m3, m9 13208 vshufps m3, m3, m10, 10001000b 13209 13210 SUMSUB_BA d, 0, 1, 9 13211 SUMSUB_BA d, 2, 3, 9 13212 SUMSUB_BA d, 0, 2, 9 13213 SUMSUB_BA d, 1, 3, 9 13214 13215 pmaddwd m4, m13 13216 pmaddwd m5, m13 13217 pmaddwd m6, m13 13218 pmaddwd m7, m13 13219 13220 psrldq m9, m4, 4 13221 psubd m10, m4, m9 13222 paddd m4, m9 13223 vshufps m4, m4, m10, 10001000b 13224 psrldq m9, m4, 4 13225 psubd m10, m4, m9 13226 paddd m4, m9 13227 vshufps m4, m4, m10, 10001000b 13228 13229 psrldq m9, m5, 4 13230 psubd m10, m5, m9 13231 paddd m5, m9 13232 vshufps m5, m5, m10, 10001000b 13233 psrldq m9, m5, 4 13234 psubd m10, m5, m9 13235 paddd m5, m9 13236 vshufps m5, m5, m10, 10001000b 13237 13238 psrldq m9, m6, 4 13239 psubd m10, m6, m9 13240 paddd m6, m9 13241 vshufps m6, m6, m10, 10001000b 13242 psrldq m9, m6, 4 13243 psubd m10, m6, m9 13244 paddd m6, m9 13245 vshufps m6, m6, m10, 10001000b 13246 13247 psrldq m9, m7, 4 13248 psubd m10, m7, m9 13249 paddd m7, m9 13250 vshufps m7, m7, m10, 10001000b 13251 psrldq m9, m7, 4 13252 psubd m10, m7, m9 13253 paddd m7, m9 13254 vshufps m7, m7, m10, 10001000b 13255 13256 SUMSUB_BA d, 4, 5, 9 13257 SUMSUB_BA d, 6, 7, 9 13258 SUMSUB_BA d, 4, 6, 9 13259 SUMSUB_BA d, 5, 7, 9 13260 13261 SUMSUB_BA d, 0, 4, 9 13262 SUMSUB_BA d, 1, 5, 9 13263 SUMSUB_BA d, 2, 6, 9 13264 SUMSUB_BA d, 3, 7, 9 13265 13266 pabsd m0, m0 13267 pabsd m2, m2 13268 pabsd m1, m1 13269 pabsd m3, m3 13270 pabsd m4, m4 13271 pabsd m5, m5 13272 pabsd m6, m6 13273 pabsd m7, m7 13274 13275 paddd m0, m2 13276 paddd m1, m3 13277 paddd m0, m1 13278 paddd m5, m4 13279 paddd m0, m5 13280 paddd m7, m6 13281 paddd m11, m0, m7 13282 13283 pmaddwd m0, m12, [rsp + 0 * mmsize] 13284 pmaddwd m1, m12, [rsp + 1 * mmsize] 13285 pmaddwd m2, m12, [rsp + 2 * mmsize] 13286 pmaddwd m3, m12, [rsp + 3 * mmsize] 13287 13288 psrldq m9, m0, 4 13289 psubd m10, m0, m9 13290 paddd m0, m9 13291 vshufps m0, m0, m10, 10001000b 13292 psrldq m9, m0, 4 13293 psubd m10, m0, m9 13294 paddd m0, m9 13295 vshufps m0, m0, m10, 10001000b 13296 13297 psrldq m9, m1, 4 13298 psubd m10, m1, m9 13299 paddd m1, m9 13300 vshufps m1, m1, m10, 10001000b 13301 psrldq m9, m1, 4 13302 psubd m10, m1, m9 13303 paddd m1, m9 13304 vshufps m1, m1, m10, 10001000b 13305 13306 psrldq m9, m2, 4 13307 psubd m10, m2, m9 13308 paddd m2, m9 13309 vshufps m2, m2, m10, 10001000b 13310 psrldq m9, m2, 4 13311 psubd m10, m2, m9 13312 paddd m2, m9 13313 vshufps m2, m2, 
m10, 10001000b 13314 13315 psrldq m9, m3, 4 13316 psubd m10, m3, m9 13317 paddd m3, m9 13318 vshufps m3, m3, m10, 10001000b 13319 psrldq m9, m3, 4 13320 psubd m10, m3, m9 13321 paddd m3, m9 13322 vshufps m3, m3, m10, 10001000b 13323 13324 SUMSUB_BA d, 0, 1, 9 13325 SUMSUB_BA d, 2, 3, 9 13326 SUMSUB_BA d, 0, 2, 9 13327 SUMSUB_BA d, 1, 3, 9 13328 13329 pmaddwd m4, m12, [rsp + 4 * mmsize] 13330 pmaddwd m5, m12, [rsp + 5 * mmsize] 13331 pmaddwd m6, m12, [rsp + 6 * mmsize] 13332 pmaddwd m7, m12, [rsp + 7 * mmsize] 13333 13334 psrldq m9, m4, 4 13335 psubd m10, m4, m9 13336 paddd m4, m9 13337 vshufps m4, m4, m10, 10001000b 13338 psrldq m9, m4, 4 13339 psubd m10, m4, m9 13340 paddd m4, m9 13341 vshufps m4, m4, m10, 10001000b 13342 13343 psrldq m9, m5, 4 13344 psubd m10, m5, m9 13345 paddd m5, m9 13346 vshufps m5, m5, m10, 10001000b 13347 psrldq m9, m5, 4 13348 psubd m10, m5, m9 13349 paddd m5, m9 13350 vshufps m5, m5, m10, 10001000b 13351 13352 psrldq m9, m6, 4 13353 psubd m10, m6, m9 13354 paddd m6, m9 13355 vshufps m6, m6, m10, 10001000b 13356 psrldq m9, m6, 4 13357 psubd m10, m6, m9 13358 paddd m6, m9 13359 vshufps m6, m6, m10, 10001000b 13360 13361 psrldq m9, m7, 4 13362 psubd m10, m7, m9 13363 paddd m7, m9 13364 vshufps m7, m7, m10, 10001000b 13365 psrldq m9, m7, 4 13366 psubd m10, m7, m9 13367 paddd m7, m9 13368 vshufps m7, m7, m10, 10001000b 13369 13370 SUMSUB_BA d, 4, 5, 9 13371 SUMSUB_BA d, 6, 7, 9 13372 SUMSUB_BA d, 4, 6, 9 13373 SUMSUB_BA d, 5, 7, 9 13374 13375 SUMSUB_BA d, 0, 4, 9 13376 SUMSUB_BA d, 1, 5, 9 13377 SUMSUB_BA d, 2, 6, 9 13378 SUMSUB_BA d, 3, 7, 9 13379 13380 pabsd m0, m0 13381 pabsd m2, m2 13382 pabsd m1, m1 13383 pabsd m3, m3 13384 pabsd m4, m4 13385 pabsd m5, m5 13386 pabsd m6, m6 13387 pabsd m7, m7 13388 13389 paddd m0, m2 13390 paddd m1, m3 13391 paddd m0, m1 13392 paddd m5, m4 13393 paddd m0, m5 13394 paddd m7, m6 13395 paddd m0, m7 13396 paddd m0, m11 13397 13398 psrldq m1, m0, 8 13399 paddd m0, m1 13400 psrldq m1, m0, 4 13401 paddd m0, m1 13402 paddd m0, [pd_2] 13403 psrld m0, 2 13404 psubd m0, m8 13405 vextracti128 xm1, m0, 1 13406 psubd m0, m1 13407 pabsd m0, m0 13408%endmacro 13409 13410%if ARCH_X86_64 13411INIT_YMM avx2 13412cglobal psyCost_ss_8x8, 4, 7, 14 13413 ; NOTE: align stack to 64 bytes, so all of local data in same cache line 13414 mov r5, rsp 13415 sub rsp, 8*mmsize 13416 and rsp, ~63 13417 13418 mova m12, [pw_1] 13419 mova m13, [pw_pmpmpmpm] 13420 add r1, r1 13421 add r3, r3 13422 13423 PSY_SS_8x8 13424 13425 movd eax, xm0 13426 mov rsp, r5 13427 RET 13428%endif 13429 13430%if ARCH_X86_64 13431INIT_YMM avx2 13432cglobal psyCost_ss_16x16, 4, 9, 15 13433 ; NOTE: align stack to 64 bytes, so all of local data in same cache line 13434 mov r5, rsp 13435 sub rsp, 8*mmsize 13436 and rsp, ~63 13437 13438 mova m12, [pw_1] 13439 mova m13, [pw_pmpmpmpm] 13440 add r1, r1 13441 add r3, r3 13442 pxor m14, m14 13443 13444 mov r7d, 2 13445.loopH: 13446 mov r8d, 2 13447.loopW: 13448 PSY_SS_8x8 13449 13450 paddd m14, m0 13451 add r0, 16 13452 add r2, 16 13453 dec r8d 13454 jnz .loopW 13455 lea r0, [r0 + r1 * 8 - 32] 13456 lea r2, [r2 + r3 * 8 - 32] 13457 dec r7d 13458 jnz .loopH 13459 movd eax, xm14 13460 mov rsp, r5 13461 RET 13462%endif 13463 13464%if ARCH_X86_64 13465INIT_YMM avx2 13466cglobal psyCost_ss_32x32, 4, 9, 15 13467 ; NOTE: align stack to 64 bytes, so all of local data in same cache line 13468 mov r5, rsp 13469 sub rsp, 8*mmsize 13470 and rsp, ~63 13471 13472 mova m12, [pw_1] 13473 mova m13, [pw_pmpmpmpm] 13474 add r1, r1 13475 add r3, r3 13476 pxor m14, 
m14 13477 13478 mov r7d, 4 13479.loopH: 13480 mov r8d, 4 13481.loopW: 13482 PSY_SS_8x8 13483 13484 paddd m14, m0 13485 add r0, 16 13486 add r2, 16 13487 dec r8d 13488 jnz .loopW 13489 lea r0, [r0 + r1 * 8 - 64] 13490 lea r2, [r2 + r3 * 8 - 64] 13491 dec r7d 13492 jnz .loopH 13493 movd eax, xm14 13494 mov rsp, r5 13495 RET 13496%endif 13497 13498%if ARCH_X86_64 13499INIT_YMM avx2 13500cglobal psyCost_ss_64x64, 4, 9, 15 13501 ; NOTE: align stack to 64 bytes, so all of local data in same cache line 13502 mov r5, rsp 13503 sub rsp, 8*mmsize 13504 and rsp, ~63 13505 13506 mova m12, [pw_1] 13507 mova m13, [pw_pmpmpmpm] 13508 add r1, r1 13509 add r3, r3 13510 pxor m14, m14 13511 13512 mov r7d, 8 13513.loopH: 13514 mov r8d, 8 13515.loopW: 13516 PSY_SS_8x8 13517 13518 paddd m14, m0 13519 add r0, 16 13520 add r2, 16 13521 dec r8d 13522 jnz .loopW 13523 lea r0, [r0 + r1 * 8 - 128] 13524 lea r2, [r2 + r3 * 8 - 128] 13525 dec r7d 13526 jnz .loopH 13527 movd eax, xm14 13528 mov rsp, r5 13529 RET 13530%endif 13531 13532;;--------------------------------------------------------------- 13533;; SATD AVX2 13534;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t) 13535;;--------------------------------------------------------------- 13536;; r0 - pix0 13537;; r1 - pix0Stride 13538;; r2 - pix1 13539;; r3 - pix1Stride 13540 13541%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 13542INIT_YMM avx2 13543cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows 13544 pxor m6, m6 13545 vbroadcasti128 m0, [r0] 13546 vbroadcasti128 m4, [r2] 13547 vbroadcasti128 m1, [r0 + r1] 13548 vbroadcasti128 m5, [r2 + r3] 13549 pmaddubsw m4, m7 13550 pmaddubsw m0, m7 13551 pmaddubsw m5, m7 13552 pmaddubsw m1, m7 13553 psubw m0, m4 13554 psubw m1, m5 13555 vbroadcasti128 m2, [r0 + r1 * 2] 13556 vbroadcasti128 m4, [r2 + r3 * 2] 13557 vbroadcasti128 m3, [r0 + r4] 13558 vbroadcasti128 m5, [r2 + r5] 13559 pmaddubsw m4, m7 13560 pmaddubsw m2, m7 13561 pmaddubsw m5, m7 13562 pmaddubsw m3, m7 13563 psubw m2, m4 13564 psubw m3, m5 13565 lea r0, [r0 + r1 * 4] 13566 lea r2, [r2 + r3 * 4] 13567 paddw m4, m0, m1 13568 psubw m1, m1, m0 13569 paddw m0, m2, m3 13570 psubw m3, m2 13571 paddw m2, m4, m0 13572 psubw m0, m4 13573 paddw m4, m1, m3 13574 psubw m3, m1 13575 pabsw m2, m2 13576 pabsw m0, m0 13577 pabsw m4, m4 13578 pabsw m3, m3 13579 pblendw m1, m2, m0, 10101010b 13580 pslld m0, 16 13581 psrld m2, 16 13582 por m0, m2 13583 pmaxsw m1, m0 13584 paddw m6, m1 13585 pblendw m2, m4, m3, 10101010b 13586 pslld m3, 16 13587 psrld m4, 16 13588 por m3, m4 13589 pmaxsw m2, m3 13590 paddw m6, m2 13591 vbroadcasti128 m1, [r0] 13592 vbroadcasti128 m4, [r2] 13593 vbroadcasti128 m2, [r0 + r1] 13594 vbroadcasti128 m5, [r2 + r3] 13595 pmaddubsw m4, m7 13596 pmaddubsw m1, m7 13597 pmaddubsw m5, m7 13598 pmaddubsw m2, m7 13599 psubw m1, m4 13600 psubw m2, m5 13601 vbroadcasti128 m0, [r0 + r1 * 2] 13602 vbroadcasti128 m4, [r2 + r3 * 2] 13603 vbroadcasti128 m3, [r0 + r4] 13604 vbroadcasti128 m5, [r2 + r5] 13605 lea r0, [r0 + r1 * 4] 13606 lea r2, [r2 + r3 * 4] 13607 pmaddubsw m4, m7 13608 pmaddubsw m0, m7 13609 pmaddubsw m5, m7 13610 pmaddubsw m3, m7 13611 psubw m0, m4 13612 psubw m3, m5 13613 paddw m4, m1, m2 13614 psubw m2, m1 13615 paddw m1, m0, m3 13616 psubw m3, m0 13617 paddw m0, m4, m1 13618 psubw m1, m4 13619 paddw m4, m2, m3 13620 psubw m3, m2 13621 pabsw m0, m0 13622 pabsw m1, m1 13623 pabsw m4, m4 13624 pabsw m3, m3 13625 pblendw m2, m0, m1, 10101010b 13626 pslld m1, 16 13627 psrld m0, 16 13628 por m1, m0 13629 pmaxsw m2, 
m1 13630 paddw m6, m2 13631 pblendw m0, m4, m3, 10101010b 13632 pslld m3, 16 13633 psrld m4, 16 13634 por m3, m4 13635 pmaxsw m0, m3 13636 paddw m6, m0 13637 vextracti128 xm0, m6, 1 13638 pmovzxwd m6, xm6 13639 pmovzxwd m0, xm0 13640 paddd m8, m6 13641 paddd m9, m0 13642 ret 13643 13644cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows 13645 pxor m6, m6 13646 vbroadcasti128 m0, [r0] 13647 vbroadcasti128 m4, [r2] 13648 vbroadcasti128 m1, [r0 + r1] 13649 vbroadcasti128 m5, [r2 + r3] 13650 pmaddubsw m4, m7 13651 pmaddubsw m0, m7 13652 pmaddubsw m5, m7 13653 pmaddubsw m1, m7 13654 psubw m0, m4 13655 psubw m1, m5 13656 vbroadcasti128 m2, [r0 + r1 * 2] 13657 vbroadcasti128 m4, [r2 + r3 * 2] 13658 vbroadcasti128 m3, [r0 + r4] 13659 vbroadcasti128 m5, [r2 + r5] 13660 pmaddubsw m4, m7 13661 pmaddubsw m2, m7 13662 pmaddubsw m5, m7 13663 pmaddubsw m3, m7 13664 psubw m2, m4 13665 psubw m3, m5 13666 paddw m4, m0, m1 13667 psubw m1, m1, m0 13668 paddw m0, m2, m3 13669 psubw m3, m2 13670 paddw m2, m4, m0 13671 psubw m0, m4 13672 paddw m4, m1, m3 13673 psubw m3, m1 13674 pabsw m2, m2 13675 pabsw m0, m0 13676 pabsw m4, m4 13677 pabsw m3, m3 13678 pblendw m1, m2, m0, 10101010b 13679 pslld m0, 16 13680 psrld m2, 16 13681 por m0, m2 13682 pmaxsw m1, m0 13683 paddw m6, m1 13684 pblendw m2, m4, m3, 10101010b 13685 pslld m3, 16 13686 psrld m4, 16 13687 por m3, m4 13688 pmaxsw m2, m3 13689 paddw m6, m2 13690 vextracti128 xm0, m6, 1 13691 pmovzxwd m6, xm6 13692 pmovzxwd m0, xm0 13693 paddd m8, m6 13694 paddd m9, m0 13695 ret 13696 13697cglobal pixel_satd_16x4, 4,6,10 ; if WIN64 && cpuflag(avx2) 13698 mova m7, [hmul_16p] 13699 lea r4, [3 * r1] 13700 lea r5, [3 * r3] 13701 pxor m8, m8 13702 pxor m9, m9 13703 13704 call calc_satd_16x4 13705 13706 paddd m8, m9 13707 vextracti128 xm0, m8, 1 13708 paddd xm0, xm8 13709 movhlps xm1, xm0 13710 paddd xm0, xm1 13711 pshuflw xm1, xm0, q0032 13712 paddd xm0, xm1 13713 movd eax, xm0 13714 RET 13715 13716cglobal pixel_satd_16x12, 4,6,10 ; if WIN64 && cpuflag(avx2) 13717 mova m7, [hmul_16p] 13718 lea r4, [3 * r1] 13719 lea r5, [3 * r3] 13720 pxor m8, m8 13721 pxor m9, m9 13722 13723 call calc_satd_16x8 13724 call calc_satd_16x4 13725 13726 paddd m8, m9 13727 vextracti128 xm0, m8, 1 13728 paddd xm0, xm8 13729 movhlps xm1, xm0 13730 paddd xm0, xm1 13731 pshuflw xm1, xm0, q0032 13732 paddd xm0, xm1 13733 movd eax, xm0 13734 RET 13735 13736cglobal pixel_satd_16x32, 4,6,10 ; if WIN64 && cpuflag(avx2) 13737 mova m7, [hmul_16p] 13738 lea r4, [3 * r1] 13739 lea r5, [3 * r3] 13740 pxor m8, m8 13741 pxor m9, m9 13742 13743 call calc_satd_16x8 13744 call calc_satd_16x8 13745 call calc_satd_16x8 13746 call calc_satd_16x8 13747 13748 paddd m8, m9 13749 vextracti128 xm0, m8, 1 13750 paddd xm0, xm8 13751 movhlps xm1, xm0 13752 paddd xm0, xm1 13753 pshuflw xm1, xm0, q0032 13754 paddd xm0, xm1 13755 movd eax, xm0 13756 RET 13757 13758cglobal pixel_satd_16x64, 4,6,10 ; if WIN64 && cpuflag(avx2) 13759 mova m7, [hmul_16p] 13760 lea r4, [3 * r1] 13761 lea r5, [3 * r3] 13762 pxor m8, m8 13763 pxor m9, m9 13764 13765 call calc_satd_16x8 13766 call calc_satd_16x8 13767 call calc_satd_16x8 13768 call calc_satd_16x8 13769 call calc_satd_16x8 13770 call calc_satd_16x8 13771 call calc_satd_16x8 13772 call calc_satd_16x8 13773 13774 paddd m8, m9 13775 vextracti128 xm0, m8, 1 13776 paddd xm0, xm8 13777 movhlps xm1, xm0 13778 paddd xm0, xm1 13779 pshuflw xm1, xm0, q0032 13780 paddd xm0, xm1 13781 movd eax, xm0 13782 RET 13783 13784cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && 
cpuflag(avx2) 13785 mova m7, [hmul_16p] 13786 lea r4, [3 * r1] 13787 lea r5, [3 * r3] 13788 pxor m8, m8 13789 pxor m9, m9 13790 mov r6, r0 13791 mov r7, r2 13792 13793 call calc_satd_16x8 13794 13795 lea r0, [r6 + 16] 13796 lea r2, [r7 + 16] 13797 13798 call calc_satd_16x8 13799 13800 paddd m8, m9 13801 vextracti128 xm0, m8, 1 13802 paddd xm0, xm8 13803 movhlps xm1, xm0 13804 paddd xm0, xm1 13805 pshuflw xm1, xm0, q0032 13806 paddd xm0, xm1 13807 movd eax, xm0 13808 RET 13809 13810cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2) 13811 mova m7, [hmul_16p] 13812 lea r4, [3 * r1] 13813 lea r5, [3 * r3] 13814 pxor m8, m8 13815 pxor m9, m9 13816 mov r6, r0 13817 mov r7, r2 13818 13819 call calc_satd_16x8 13820 call calc_satd_16x8 13821 13822 lea r0, [r6 + 16] 13823 lea r2, [r7 + 16] 13824 13825 call calc_satd_16x8 13826 call calc_satd_16x8 13827 13828 paddd m8, m9 13829 vextracti128 xm0, m8, 1 13830 paddd xm0, xm8 13831 movhlps xm1, xm0 13832 paddd xm0, xm1 13833 pshuflw xm1, xm0, q0032 13834 paddd xm0, xm1 13835 movd eax, xm0 13836 RET 13837 13838cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2) 13839 mova m7, [hmul_16p] 13840 lea r4, [3 * r1] 13841 lea r5, [3 * r3] 13842 pxor m8, m8 13843 pxor m9, m9 13844 mov r6, r0 13845 mov r7, r2 13846 13847 call calc_satd_16x8 13848 call calc_satd_16x8 13849 call calc_satd_16x8 13850 13851 lea r0, [r6 + 16] 13852 lea r2, [r7 + 16] 13853 13854 call calc_satd_16x8 13855 call calc_satd_16x8 13856 call calc_satd_16x8 13857 13858 paddd m8, m9 13859 vextracti128 xm0, m8, 1 13860 paddd xm0, xm8 13861 movhlps xm1, xm0 13862 paddd xm0, xm1 13863 pshuflw xm1, xm0, q0032 13864 paddd xm0, xm1 13865 movd eax, xm0 13866 RET 13867 13868cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2) 13869 mova m7, [hmul_16p] 13870 lea r4, [3 * r1] 13871 lea r5, [3 * r3] 13872 pxor m8, m8 13873 pxor m9, m9 13874 mov r6, r0 13875 mov r7, r2 13876 13877 call calc_satd_16x8 13878 call calc_satd_16x8 13879 call calc_satd_16x8 13880 call calc_satd_16x8 13881 13882 lea r0, [r6 + 16] 13883 lea r2, [r7 + 16] 13884 13885 call calc_satd_16x8 13886 call calc_satd_16x8 13887 call calc_satd_16x8 13888 call calc_satd_16x8 13889 13890 paddd m8, m9 13891 vextracti128 xm0, m8, 1 13892 paddd xm0, xm8 13893 movhlps xm1, xm0 13894 paddd xm0, xm1 13895 pshuflw xm1, xm0, q0032 13896 paddd xm0, xm1 13897 movd eax, xm0 13898 RET 13899 13900cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2) 13901 mova m7, [hmul_16p] 13902 lea r4, [3 * r1] 13903 lea r5, [3 * r3] 13904 pxor m8, m8 13905 pxor m9, m9 13906 mov r6, r0 13907 mov r7, r2 13908 13909 call calc_satd_16x8 13910 call calc_satd_16x8 13911 call calc_satd_16x8 13912 call calc_satd_16x8 13913 call calc_satd_16x8 13914 call calc_satd_16x8 13915 call calc_satd_16x8 13916 call calc_satd_16x8 13917 13918 lea r0, [r6 + 16] 13919 lea r2, [r7 + 16] 13920 13921 call calc_satd_16x8 13922 call calc_satd_16x8 13923 call calc_satd_16x8 13924 call calc_satd_16x8 13925 call calc_satd_16x8 13926 call calc_satd_16x8 13927 call calc_satd_16x8 13928 call calc_satd_16x8 13929 13930 paddd m8, m9 13931 vextracti128 xm0, m8, 1 13932 paddd xm0, xm8 13933 movhlps xm1, xm0 13934 paddd xm0, xm1 13935 pshuflw xm1, xm0, q0032 13936 paddd xm0, xm1 13937 movd eax, xm0 13938 RET 13939 13940cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2) 13941 mova m7, [hmul_16p] 13942 lea r4, [3 * r1] 13943 lea r5, [3 * r3] 13944 pxor m8, m8 13945 pxor m9, m9 13946 mov r6, r0 13947 mov r7, r2 13948 13949 call calc_satd_16x8 13950 call calc_satd_16x8 
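    ; calc_satd_16x8 advances r0/r2 by eight rows per call, so eight consecutive
    ; calls cover one 16-pixel-wide, 64-row column; the lea blocks below then move
    ; to the next column.  A rough scalar sketch of this tiling (illustrative only,
    ; satd_16x8() stands in for one calc_satd_16x8 call):
    ;   for (int col = 0; col < 48; col += 16)
    ;       for (int row = 0; row < 64; row += 8)
    ;           sum += satd_16x8(pix0 + row * stride0 + col, pix1 + row * stride1 + col);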
13951 call calc_satd_16x8 13952 call calc_satd_16x8 13953 call calc_satd_16x8 13954 call calc_satd_16x8 13955 call calc_satd_16x8 13956 call calc_satd_16x8 13957 lea r0, [r6 + 16] 13958 lea r2, [r7 + 16] 13959 call calc_satd_16x8 13960 call calc_satd_16x8 13961 call calc_satd_16x8 13962 call calc_satd_16x8 13963 call calc_satd_16x8 13964 call calc_satd_16x8 13965 call calc_satd_16x8 13966 call calc_satd_16x8 13967 lea r0, [r6 + 32] 13968 lea r2, [r7 + 32] 13969 call calc_satd_16x8 13970 call calc_satd_16x8 13971 call calc_satd_16x8 13972 call calc_satd_16x8 13973 call calc_satd_16x8 13974 call calc_satd_16x8 13975 call calc_satd_16x8 13976 call calc_satd_16x8 13977 13978 paddd m8, m9 13979 vextracti128 xm0, m8, 1 13980 paddd xm0, xm8 13981 movhlps xm1, xm0 13982 paddd xm0, xm1 13983 pshuflw xm1, xm0, q0032 13984 paddd xm0, xm1 13985 movd eax, xm0 13986 RET 13987 13988cglobal pixel_satd_64x16, 4,8,10 ; if WIN64 && cpuflag(avx2) 13989 mova m7, [hmul_16p] 13990 lea r4, [3 * r1] 13991 lea r5, [3 * r3] 13992 pxor m8, m8 13993 pxor m9, m9 13994 mov r6, r0 13995 mov r7, r2 13996 13997 call calc_satd_16x8 13998 call calc_satd_16x8 13999 lea r0, [r6 + 16] 14000 lea r2, [r7 + 16] 14001 call calc_satd_16x8 14002 call calc_satd_16x8 14003 lea r0, [r6 + 32] 14004 lea r2, [r7 + 32] 14005 call calc_satd_16x8 14006 call calc_satd_16x8 14007 lea r0, [r6 + 48] 14008 lea r2, [r7 + 48] 14009 call calc_satd_16x8 14010 call calc_satd_16x8 14011 14012 paddd m8, m9 14013 vextracti128 xm0, m8, 1 14014 paddd xm0, xm8 14015 movhlps xm1, xm0 14016 paddd xm0, xm1 14017 pshuflw xm1, xm0, q0032 14018 paddd xm0, xm1 14019 movd eax, xm0 14020 RET 14021 14022cglobal pixel_satd_64x32, 4,8,10 ; if WIN64 && cpuflag(avx2) 14023 mova m7, [hmul_16p] 14024 lea r4, [3 * r1] 14025 lea r5, [3 * r3] 14026 pxor m8, m8 14027 pxor m9, m9 14028 mov r6, r0 14029 mov r7, r2 14030 14031 call calc_satd_16x8 14032 call calc_satd_16x8 14033 call calc_satd_16x8 14034 call calc_satd_16x8 14035 lea r0, [r6 + 16] 14036 lea r2, [r7 + 16] 14037 call calc_satd_16x8 14038 call calc_satd_16x8 14039 call calc_satd_16x8 14040 call calc_satd_16x8 14041 lea r0, [r6 + 32] 14042 lea r2, [r7 + 32] 14043 call calc_satd_16x8 14044 call calc_satd_16x8 14045 call calc_satd_16x8 14046 call calc_satd_16x8 14047 lea r0, [r6 + 48] 14048 lea r2, [r7 + 48] 14049 call calc_satd_16x8 14050 call calc_satd_16x8 14051 call calc_satd_16x8 14052 call calc_satd_16x8 14053 14054 paddd m8, m9 14055 vextracti128 xm0, m8, 1 14056 paddd xm0, xm8 14057 movhlps xm1, xm0 14058 paddd xm0, xm1 14059 pshuflw xm1, xm0, q0032 14060 paddd xm0, xm1 14061 movd eax, xm0 14062 RET 14063 14064cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2) 14065 mova m7, [hmul_16p] 14066 lea r4, [3 * r1] 14067 lea r5, [3 * r3] 14068 pxor m8, m8 14069 pxor m9, m9 14070 mov r6, r0 14071 mov r7, r2 14072 14073 call calc_satd_16x8 14074 call calc_satd_16x8 14075 call calc_satd_16x8 14076 call calc_satd_16x8 14077 call calc_satd_16x8 14078 call calc_satd_16x8 14079 lea r0, [r6 + 16] 14080 lea r2, [r7 + 16] 14081 call calc_satd_16x8 14082 call calc_satd_16x8 14083 call calc_satd_16x8 14084 call calc_satd_16x8 14085 call calc_satd_16x8 14086 call calc_satd_16x8 14087 lea r0, [r6 + 32] 14088 lea r2, [r7 + 32] 14089 call calc_satd_16x8 14090 call calc_satd_16x8 14091 call calc_satd_16x8 14092 call calc_satd_16x8 14093 call calc_satd_16x8 14094 call calc_satd_16x8 14095 lea r0, [r6 + 48] 14096 lea r2, [r7 + 48] 14097 call calc_satd_16x8 14098 call calc_satd_16x8 14099 call calc_satd_16x8 14100 call 
calc_satd_16x8 14101 call calc_satd_16x8 14102 call calc_satd_16x8 14103 14104 paddd m8, m9 14105 vextracti128 xm0, m8, 1 14106 paddd xm0, xm8 14107 movhlps xm1, xm0 14108 paddd xm0, xm1 14109 pshuflw xm1, xm0, q0032 14110 paddd xm0, xm1 14111 movd eax, xm0 14112 RET 14113 14114cglobal pixel_satd_64x64, 4,8,10 ; if WIN64 && cpuflag(avx2) 14115 mova m7, [hmul_16p] 14116 lea r4, [3 * r1] 14117 lea r5, [3 * r3] 14118 pxor m8, m8 14119 pxor m9, m9 14120 mov r6, r0 14121 mov r7, r2 14122 14123 call calc_satd_16x8 14124 call calc_satd_16x8 14125 call calc_satd_16x8 14126 call calc_satd_16x8 14127 call calc_satd_16x8 14128 call calc_satd_16x8 14129 call calc_satd_16x8 14130 call calc_satd_16x8 14131 lea r0, [r6 + 16] 14132 lea r2, [r7 + 16] 14133 call calc_satd_16x8 14134 call calc_satd_16x8 14135 call calc_satd_16x8 14136 call calc_satd_16x8 14137 call calc_satd_16x8 14138 call calc_satd_16x8 14139 call calc_satd_16x8 14140 call calc_satd_16x8 14141 lea r0, [r6 + 32] 14142 lea r2, [r7 + 32] 14143 call calc_satd_16x8 14144 call calc_satd_16x8 14145 call calc_satd_16x8 14146 call calc_satd_16x8 14147 call calc_satd_16x8 14148 call calc_satd_16x8 14149 call calc_satd_16x8 14150 call calc_satd_16x8 14151 lea r0, [r6 + 48] 14152 lea r2, [r7 + 48] 14153 call calc_satd_16x8 14154 call calc_satd_16x8 14155 call calc_satd_16x8 14156 call calc_satd_16x8 14157 call calc_satd_16x8 14158 call calc_satd_16x8 14159 call calc_satd_16x8 14160 call calc_satd_16x8 14161 14162 paddd m8, m9 14163 vextracti128 xm0, m8, 1 14164 paddd xm0, xm8 14165 movhlps xm1, xm0 14166 paddd xm0, xm1 14167 pshuflw xm1, xm0, q0032 14168 paddd xm0, xm1 14169 movd eax, xm0 14170 RET 14171 14172%macro PROCESS_SATD_32x4_AVX512 0 ; function to compute satd cost for 32 columns, 4 rows 14173 ; rows 0-3 14174 pmovzxbw m0, [r0] 14175 pmovzxbw m4, [r2] 14176 psubw m0, m4 14177 pmovzxbw m1, [r0 + r1] 14178 pmovzxbw m5, [r2 + r3] 14179 psubw m1, m5 14180 pmovzxbw m2, [r0 + r1 * 2] 14181 pmovzxbw m4, [r2 + r3 * 2] 14182 psubw m2, m4 14183 pmovzxbw m3, [r0 + r4] 14184 pmovzxbw m5, [r2 + r5] 14185 psubw m3, m5 14186 paddw m4, m0, m1 14187 psubw m1, m0 14188 paddw m0, m2, m3 14189 psubw m3, m2 14190 punpckhwd m2, m4, m1 14191 punpcklwd m4, m1 14192 punpckhwd m1, m0, m3 14193 punpcklwd m0, m3 14194 paddw m3, m4, m0 14195 psubw m0, m4 14196 paddw m4, m2, m1 14197 psubw m1, m2 14198 punpckhdq m2, m3, m0 14199 punpckldq m3, m0 14200 paddw m0, m3, m2 14201 psubw m2, m3 14202 punpckhdq m3, m4, m1 14203 punpckldq m4, m1 14204 paddw m1, m4, m3 14205 psubw m3, m4 14206 punpckhqdq m4, m0, m1 14207 punpcklqdq m0, m1 14208 pabsw m0, m0 14209 pabsw m4, m4 14210 pmaxsw m0, m0, m4 14211 punpckhqdq m1, m2, m3 14212 punpcklqdq m2, m3 14213 pabsw m2, m2 14214 pabsw m1, m1 14215 pmaxsw m2, m1 14216 pxor m7, m7 14217 mova m1, m0 14218 punpcklwd m1, m7 14219 paddd m6, m1 14220 mova m1, m0 14221 punpckhwd m1, m7 14222 paddd m6, m1 14223 pxor m7, m7 14224 mova m1, m2 14225 punpcklwd m1, m7 14226 paddd m6, m1 14227 mova m1, m2 14228 punpckhwd m1, m7 14229 paddd m6, m1 14230%endmacro 14231 14232%macro SATD_MAIN_AVX512_END 0 14233 vextracti32x8 ym7, m6, 1 14234 paddd ym6, ym7 14235 vextracti128 xm7, ym6, 1 14236 paddd xm6, xm6, xm7 14237 punpckhqdq xm7, xm6, xm6 14238 paddd xm6, xm7 14239 movq rax, xm6 14240 rorx rdx, rax, 32 14241 add eax, edx 14242%endmacro 14243 14244%macro SATD_32xN_AVX512 1 14245INIT_ZMM avx512 14246cglobal pixel_satd_32x%1, 4,6,8 14247 lea r4, [3 * r1] 14248 lea r5, [3 * r3] 14249 pxor m6, m6 14250%rep %1/4 - 1 14251 PROCESS_SATD_32x4_AVX512 14252 lea 
r0, [r0 + 4 * r1] 14253 lea r2, [r2 + 4 * r3] 14254%endrep 14255 PROCESS_SATD_32x4_AVX512 14256 SATD_MAIN_AVX512_END 14257 RET 14258%endmacro 14259 14260SATD_32xN_AVX512 8 14261SATD_32xN_AVX512 16 14262SATD_32xN_AVX512 24 14263SATD_32xN_AVX512 32 14264SATD_32xN_AVX512 48 14265SATD_32xN_AVX512 64 14266 14267%macro SATD_64xN_AVX512 1 14268INIT_ZMM avx512 14269cglobal pixel_satd_64x%1, 4,8,8 14270 lea r4, [3 * r1] 14271 lea r5, [3 * r3] 14272 pxor m6, m6 14273 mov r6, r0 14274 mov r7, r2 14275 14276%rep %1/4 - 1 14277 PROCESS_SATD_32x4_AVX512 14278 lea r0, [r0 + 4 * r1] 14279 lea r2, [r2 + 4 * r3] 14280%endrep 14281 PROCESS_SATD_32x4_AVX512 14282 lea r0, [r6 + mmsize/2] 14283 lea r2, [r7 + mmsize/2] 14284%rep %1/4 - 1 14285 PROCESS_SATD_32x4_AVX512 14286 lea r0, [r0 + 4 * r1] 14287 lea r2, [r2 + 4 * r3] 14288%endrep 14289 PROCESS_SATD_32x4_AVX512 14290 SATD_MAIN_AVX512_END 14291 RET 14292%endmacro 14293 14294SATD_64xN_AVX512 16 14295SATD_64xN_AVX512 32 14296SATD_64xN_AVX512 48 14297SATD_64xN_AVX512 64 14298%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 14299%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1 14300INIT_YMM avx2 14301cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows 14302 ; rows 0-3 14303 movu m0, [r0] 14304 movu m4, [r2] 14305 psubw m0, m4 14306 movu m1, [r0 + r1] 14307 movu m5, [r2 + r3] 14308 psubw m1, m5 14309 movu m2, [r0 + r1 * 2] 14310 movu m4, [r2 + r3 * 2] 14311 psubw m2, m4 14312 movu m3, [r0 + r4] 14313 movu m5, [r2 + r5] 14314 psubw m3, m5 14315 lea r0, [r0 + r1 * 4] 14316 lea r2, [r2 + r3 * 4] 14317 paddw m4, m0, m1 14318 psubw m1, m0 14319 paddw m0, m2, m3 14320 psubw m3, m2 14321 punpckhwd m2, m4, m1 14322 punpcklwd m4, m1 14323 punpckhwd m1, m0, m3 14324 punpcklwd m0, m3 14325 paddw m3, m4, m0 14326 psubw m0, m4 14327 paddw m4, m2, m1 14328 psubw m1, m2 14329 punpckhdq m2, m3, m0 14330 punpckldq m3, m0 14331 paddw m0, m3, m2 14332 psubw m2, m3 14333 punpckhdq m3, m4, m1 14334 punpckldq m4, m1 14335 paddw m1, m4, m3 14336 psubw m3, m4 14337 punpckhqdq m4, m0, m1 14338 punpcklqdq m0, m1 14339 pabsw m0, m0 14340 pabsw m4, m4 14341 pmaxsw m0, m0, m4 14342 punpckhqdq m1, m2, m3 14343 punpcklqdq m2, m3 14344 pabsw m2, m2 14345 pabsw m1, m1 14346 pmaxsw m2, m1 14347 pxor m7, m7 14348 mova m1, m0 14349 punpcklwd m1, m7 14350 paddd m6, m1 14351 mova m1, m0 14352 punpckhwd m1, m7 14353 paddd m6, m1 14354 pxor m7, m7 14355 mova m1, m2 14356 punpcklwd m1, m7 14357 paddd m6, m1 14358 mova m1, m2 14359 punpckhwd m1, m7 14360 paddd m6, m1 14361 ; rows 4-7 14362 movu m0, [r0] 14363 movu m4, [r2] 14364 psubw m0, m4 14365 movu m1, [r0 + r1] 14366 movu m5, [r2 + r3] 14367 psubw m1, m5 14368 movu m2, [r0 + r1 * 2] 14369 movu m4, [r2 + r3 * 2] 14370 psubw m2, m4 14371 movu m3, [r0 + r4] 14372 movu m5, [r2 + r5] 14373 psubw m3, m5 14374 lea r0, [r0 + r1 * 4] 14375 lea r2, [r2 + r3 * 4] 14376 paddw m4, m0, m1 14377 psubw m1, m0 14378 paddw m0, m2, m3 14379 psubw m3, m2 14380 punpckhwd m2, m4, m1 14381 punpcklwd m4, m1 14382 punpckhwd m1, m0, m3 14383 punpcklwd m0, m3 14384 paddw m3, m4, m0 14385 psubw m0, m4 14386 paddw m4, m2, m1 14387 psubw m1, m2 14388 punpckhdq m2, m3, m0 14389 punpckldq m3, m0 14390 paddw m0, m3, m2 14391 psubw m2, m3 14392 punpckhdq m3, m4, m1 14393 punpckldq m4, m1 14394 paddw m1, m4, m3 14395 psubw m3, m4 14396 punpckhqdq m4, m0, m1 14397 punpcklqdq m0, m1 14398 pabsw m0, m0 14399 pabsw m4, m4 14400 pmaxsw m0, m0, m4 14401 punpckhqdq m1, m2, m3 14402 punpcklqdq m2, m3 14403 pabsw m2, m2 14404 pabsw m1, m1 14405 pmaxsw m2, m1 14406 pxor 
m7, m7 14407 mova m1, m0 14408 punpcklwd m1, m7 14409 paddd m6, m1 14410 mova m1, m0 14411 punpckhwd m1, m7 14412 paddd m6, m1 14413 pxor m7, m7 14414 mova m1, m2 14415 punpcklwd m1, m7 14416 paddd m6, m1 14417 mova m1, m2 14418 punpckhwd m1, m7 14419 paddd m6, m1 14420 ret 14421 14422cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows 14423 ; rows 0-3 14424 movu m0, [r0] 14425 movu m4, [r2] 14426 psubw m0, m4 14427 movu m1, [r0 + r1] 14428 movu m5, [r2 + r3] 14429 psubw m1, m5 14430 movu m2, [r0 + r1 * 2] 14431 movu m4, [r2 + r3 * 2] 14432 psubw m2, m4 14433 movu m3, [r0 + r4] 14434 movu m5, [r2 + r5] 14435 psubw m3, m5 14436 lea r0, [r0 + r1 * 4] 14437 lea r2, [r2 + r3 * 4] 14438 paddw m4, m0, m1 14439 psubw m1, m0 14440 paddw m0, m2, m3 14441 psubw m3, m2 14442 punpckhwd m2, m4, m1 14443 punpcklwd m4, m1 14444 punpckhwd m1, m0, m3 14445 punpcklwd m0, m3 14446 paddw m3, m4, m0 14447 psubw m0, m4 14448 paddw m4, m2, m1 14449 psubw m1, m2 14450 punpckhdq m2, m3, m0 14451 punpckldq m3, m0 14452 paddw m0, m3, m2 14453 psubw m2, m3 14454 punpckhdq m3, m4, m1 14455 punpckldq m4, m1 14456 paddw m1, m4, m3 14457 psubw m3, m4 14458 punpckhqdq m4, m0, m1 14459 punpcklqdq m0, m1 14460 pabsw m0, m0 14461 pabsw m4, m4 14462 pmaxsw m0, m0, m4 14463 punpckhqdq m1, m2, m3 14464 punpcklqdq m2, m3 14465 pabsw m2, m2 14466 pabsw m1, m1 14467 pmaxsw m2, m1 14468 pxor m7, m7 14469 mova m1, m0 14470 punpcklwd m1, m7 14471 paddd m6, m1 14472 mova m1, m0 14473 punpckhwd m1, m7 14474 paddd m6, m1 14475 pxor m7, m7 14476 mova m1, m2 14477 punpcklwd m1, m7 14478 paddd m6, m1 14479 mova m1, m2 14480 punpckhwd m1, m7 14481 paddd m6, m1 14482 ret 14483 14484cglobal pixel_satd_16x4, 4,6,8 14485 add r1d, r1d 14486 add r3d, r3d 14487 lea r4, [3 * r1] 14488 lea r5, [3 * r3] 14489 pxor m6, m6 14490 14491 call calc_satd_16x4 14492 14493 vextracti128 xm7, m6, 1 14494 paddd xm6, xm7 14495 pxor xm7, xm7 14496 movhlps xm7, xm6 14497 paddd xm6, xm7 14498 pshufd xm7, xm6, 1 14499 paddd xm6, xm7 14500 movd eax, xm6 14501 RET 14502 14503cglobal pixel_satd_16x8, 4,6,8 14504 add r1d, r1d 14505 add r3d, r3d 14506 lea r4, [3 * r1] 14507 lea r5, [3 * r3] 14508 pxor m6, m6 14509 14510 call calc_satd_16x8 14511 14512 vextracti128 xm7, m6, 1 14513 paddd xm6, xm7 14514 pxor xm7, xm7 14515 movhlps xm7, xm6 14516 paddd xm6, xm7 14517 pshufd xm7, xm6, 1 14518 paddd xm6, xm7 14519 movd eax, xm6 14520 RET 14521 14522cglobal pixel_satd_16x12, 4,6,8 14523 add r1d, r1d 14524 add r3d, r3d 14525 lea r4, [3 * r1] 14526 lea r5, [3 * r3] 14527 pxor m6, m6 14528 14529 call calc_satd_16x8 14530 call calc_satd_16x4 14531 14532 vextracti128 xm7, m6, 1 14533 paddd xm6, xm7 14534 pxor xm7, xm7 14535 movhlps xm7, xm6 14536 paddd xm6, xm7 14537 pshufd xm7, xm6, 1 14538 paddd xm6, xm7 14539 movd eax, xm6 14540 RET 14541 14542cglobal pixel_satd_16x16, 4,6,8 14543 add r1d, r1d 14544 add r3d, r3d 14545 lea r4, [3 * r1] 14546 lea r5, [3 * r3] 14547 pxor m6, m6 14548 14549 call calc_satd_16x8 14550 call calc_satd_16x8 14551 14552 vextracti128 xm7, m6, 1 14553 paddd xm6, xm7 14554 pxor xm7, xm7 14555 movhlps xm7, xm6 14556 paddd xm6, xm7 14557 pshufd xm7, xm6, 1 14558 paddd xm6, xm7 14559 movd eax, xm6 14560 RET 14561 14562cglobal pixel_satd_16x32, 4,6,8 14563 add r1d, r1d 14564 add r3d, r3d 14565 lea r4, [3 * r1] 14566 lea r5, [3 * r3] 14567 pxor m6, m6 14568 14569 call calc_satd_16x8 14570 call calc_satd_16x8 14571 call calc_satd_16x8 14572 call calc_satd_16x8 14573 14574 vextracti128 xm7, m6, 1 14575 paddd xm6, xm7 14576 pxor xm7, xm7 
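    ; horizontal reduction: fold the high qword, then the second dword, into the
    ; low dword of xm6 so movd can return the final SATD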
14577 movhlps xm7, xm6 14578 paddd xm6, xm7 14579 pshufd xm7, xm6, 1 14580 paddd xm6, xm7 14581 movd eax, xm6 14582 RET 14583 14584cglobal pixel_satd_16x64, 4,6,8 14585 add r1d, r1d 14586 add r3d, r3d 14587 lea r4, [3 * r1] 14588 lea r5, [3 * r3] 14589 pxor m6, m6 14590 14591 call calc_satd_16x8 14592 call calc_satd_16x8 14593 call calc_satd_16x8 14594 call calc_satd_16x8 14595 call calc_satd_16x8 14596 call calc_satd_16x8 14597 call calc_satd_16x8 14598 call calc_satd_16x8 14599 14600 vextracti128 xm7, m6, 1 14601 paddd xm6, xm7 14602 pxor xm7, xm7 14603 movhlps xm7, xm6 14604 paddd xm6, xm7 14605 pshufd xm7, xm6, 1 14606 paddd xm6, xm7 14607 movd eax, xm6 14608 RET 14609 14610cglobal pixel_satd_32x8, 4,8,8 14611 add r1d, r1d 14612 add r3d, r3d 14613 lea r4, [3 * r1] 14614 lea r5, [3 * r3] 14615 pxor m6, m6 14616 mov r6, r0 14617 mov r7, r2 14618 14619 call calc_satd_16x8 14620 14621 lea r0, [r6 + 32] 14622 lea r2, [r7 + 32] 14623 14624 call calc_satd_16x8 14625 14626 vextracti128 xm7, m6, 1 14627 paddd xm6, xm7 14628 pxor xm7, xm7 14629 movhlps xm7, xm6 14630 paddd xm6, xm7 14631 pshufd xm7, xm6, 1 14632 paddd xm6, xm7 14633 movd eax, xm6 14634 RET 14635 14636cglobal pixel_satd_32x16, 4,8,8 14637 add r1d, r1d 14638 add r3d, r3d 14639 lea r4, [3 * r1] 14640 lea r5, [3 * r3] 14641 pxor m6, m6 14642 mov r6, r0 14643 mov r7, r2 14644 14645 call calc_satd_16x8 14646 call calc_satd_16x8 14647 14648 lea r0, [r6 + 32] 14649 lea r2, [r7 + 32] 14650 14651 call calc_satd_16x8 14652 call calc_satd_16x8 14653 14654 vextracti128 xm7, m6, 1 14655 paddd xm6, xm7 14656 pxor xm7, xm7 14657 movhlps xm7, xm6 14658 paddd xm6, xm7 14659 pshufd xm7, xm6, 1 14660 paddd xm6, xm7 14661 movd eax, xm6 14662 RET 14663 14664cglobal pixel_satd_32x24, 4,8,8 14665 add r1d, r1d 14666 add r3d, r3d 14667 lea r4, [3 * r1] 14668 lea r5, [3 * r3] 14669 pxor m6, m6 14670 mov r6, r0 14671 mov r7, r2 14672 14673 call calc_satd_16x8 14674 call calc_satd_16x8 14675 call calc_satd_16x8 14676 14677 lea r0, [r6 + 32] 14678 lea r2, [r7 + 32] 14679 14680 call calc_satd_16x8 14681 call calc_satd_16x8 14682 call calc_satd_16x8 14683 14684 vextracti128 xm7, m6, 1 14685 paddd xm6, xm7 14686 pxor xm7, xm7 14687 movhlps xm7, xm6 14688 paddd xm6, xm7 14689 pshufd xm7, xm6, 1 14690 paddd xm6, xm7 14691 movd eax, xm6 14692 RET 14693 14694cglobal pixel_satd_32x32, 4,8,8 14695 add r1d, r1d 14696 add r3d, r3d 14697 lea r4, [3 * r1] 14698 lea r5, [3 * r3] 14699 pxor m6, m6 14700 mov r6, r0 14701 mov r7, r2 14702 14703 call calc_satd_16x8 14704 call calc_satd_16x8 14705 call calc_satd_16x8 14706 call calc_satd_16x8 14707 14708 lea r0, [r6 + 32] 14709 lea r2, [r7 + 32] 14710 14711 call calc_satd_16x8 14712 call calc_satd_16x8 14713 call calc_satd_16x8 14714 call calc_satd_16x8 14715 14716 vextracti128 xm7, m6, 1 14717 paddd xm6, xm7 14718 pxor xm7, xm7 14719 movhlps xm7, xm6 14720 paddd xm6, xm7 14721 pshufd xm7, xm6, 1 14722 paddd xm6, xm7 14723 movd eax, xm6 14724 RET 14725 14726cglobal pixel_satd_32x64, 4,8,8 14727 add r1d, r1d 14728 add r3d, r3d 14729 lea r4, [3 * r1] 14730 lea r5, [3 * r3] 14731 pxor m6, m6 14732 mov r6, r0 14733 mov r7, r2 14734 14735 call calc_satd_16x8 14736 call calc_satd_16x8 14737 call calc_satd_16x8 14738 call calc_satd_16x8 14739 call calc_satd_16x8 14740 call calc_satd_16x8 14741 call calc_satd_16x8 14742 call calc_satd_16x8 14743 14744 lea r0, [r6 + 32] 14745 lea r2, [r7 + 32] 14746 14747 call calc_satd_16x8 14748 call calc_satd_16x8 14749 call calc_satd_16x8 14750 call calc_satd_16x8 14751 call calc_satd_16x8 14752 
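    ; the dword partial sums stay in m6 across all calls; the reduction to eax
    ; happens only after the last call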
call calc_satd_16x8 14753 call calc_satd_16x8 14754 call calc_satd_16x8 14755 14756 vextracti128 xm7, m6, 1 14757 paddd xm6, xm7 14758 pxor xm7, xm7 14759 movhlps xm7, xm6 14760 paddd xm6, xm7 14761 pshufd xm7, xm6, 1 14762 paddd xm6, xm7 14763 movd eax, xm6 14764 RET 14765 14766cglobal pixel_satd_48x64, 4,8,8 14767 add r1d, r1d 14768 add r3d, r3d 14769 lea r4, [3 * r1] 14770 lea r5, [3 * r3] 14771 pxor m6, m6 14772 mov r6, r0 14773 mov r7, r2 14774 14775 call calc_satd_16x8 14776 call calc_satd_16x8 14777 call calc_satd_16x8 14778 call calc_satd_16x8 14779 call calc_satd_16x8 14780 call calc_satd_16x8 14781 call calc_satd_16x8 14782 call calc_satd_16x8 14783 14784 lea r0, [r6 + 32] 14785 lea r2, [r7 + 32] 14786 14787 call calc_satd_16x8 14788 call calc_satd_16x8 14789 call calc_satd_16x8 14790 call calc_satd_16x8 14791 call calc_satd_16x8 14792 call calc_satd_16x8 14793 call calc_satd_16x8 14794 call calc_satd_16x8 14795 14796 lea r0, [r6 + 64] 14797 lea r2, [r7 + 64] 14798 14799 call calc_satd_16x8 14800 call calc_satd_16x8 14801 call calc_satd_16x8 14802 call calc_satd_16x8 14803 call calc_satd_16x8 14804 call calc_satd_16x8 14805 call calc_satd_16x8 14806 call calc_satd_16x8 14807 14808 vextracti128 xm7, m6, 1 14809 paddd xm6, xm7 14810 pxor xm7, xm7 14811 movhlps xm7, xm6 14812 paddd xm6, xm7 14813 pshufd xm7, xm6, 1 14814 paddd xm6, xm7 14815 movd eax, xm6 14816 RET 14817 14818cglobal pixel_satd_64x16, 4,8,8 14819 add r1d, r1d 14820 add r3d, r3d 14821 lea r4, [3 * r1] 14822 lea r5, [3 * r3] 14823 pxor m6, m6 14824 mov r6, r0 14825 mov r7, r2 14826 14827 call calc_satd_16x8 14828 call calc_satd_16x8 14829 14830 lea r0, [r6 + 32] 14831 lea r2, [r7 + 32] 14832 14833 call calc_satd_16x8 14834 call calc_satd_16x8 14835 14836 lea r0, [r6 + 64] 14837 lea r2, [r7 + 64] 14838 14839 call calc_satd_16x8 14840 call calc_satd_16x8 14841 14842 lea r0, [r6 + 96] 14843 lea r2, [r7 + 96] 14844 14845 call calc_satd_16x8 14846 call calc_satd_16x8 14847 14848 vextracti128 xm7, m6, 1 14849 paddd xm6, xm7 14850 pxor xm7, xm7 14851 movhlps xm7, xm6 14852 paddd xm6, xm7 14853 pshufd xm7, xm6, 1 14854 paddd xm6, xm7 14855 movd eax, xm6 14856 RET 14857 14858cglobal pixel_satd_64x32, 4,8,8 14859 add r1d, r1d 14860 add r3d, r3d 14861 lea r4, [3 * r1] 14862 lea r5, [3 * r3] 14863 pxor m6, m6 14864 mov r6, r0 14865 mov r7, r2 14866 14867 call calc_satd_16x8 14868 call calc_satd_16x8 14869 call calc_satd_16x8 14870 call calc_satd_16x8 14871 14872 lea r0, [r6 + 32] 14873 lea r2, [r7 + 32] 14874 14875 call calc_satd_16x8 14876 call calc_satd_16x8 14877 call calc_satd_16x8 14878 call calc_satd_16x8 14879 14880 lea r0, [r6 + 64] 14881 lea r2, [r7 + 64] 14882 14883 call calc_satd_16x8 14884 call calc_satd_16x8 14885 call calc_satd_16x8 14886 call calc_satd_16x8 14887 14888 lea r0, [r6 + 96] 14889 lea r2, [r7 + 96] 14890 14891 call calc_satd_16x8 14892 call calc_satd_16x8 14893 call calc_satd_16x8 14894 call calc_satd_16x8 14895 14896 vextracti128 xm7, m6, 1 14897 paddd xm6, xm7 14898 pxor xm7, xm7 14899 movhlps xm7, xm6 14900 paddd xm6, xm7 14901 pshufd xm7, xm6, 1 14902 paddd xm6, xm7 14903 movd eax, xm6 14904 RET 14905 14906cglobal pixel_satd_64x48, 4,8,8 14907 add r1d, r1d 14908 add r3d, r3d 14909 lea r4, [3 * r1] 14910 lea r5, [3 * r3] 14911 pxor m6, m6 14912 mov r6, r0 14913 mov r7, r2 14914 14915 call calc_satd_16x8 14916 call calc_satd_16x8 14917 call calc_satd_16x8 14918 call calc_satd_16x8 14919 call calc_satd_16x8 14920 call calc_satd_16x8 14921 14922 lea r0, [r6 + 32] 14923 lea r2, [r7 + 32] 14924 14925 call 
calc_satd_16x8 14926 call calc_satd_16x8 14927 call calc_satd_16x8 14928 call calc_satd_16x8 14929 call calc_satd_16x8 14930 call calc_satd_16x8 14931 14932 lea r0, [r6 + 64] 14933 lea r2, [r7 + 64] 14934 14935 call calc_satd_16x8 14936 call calc_satd_16x8 14937 call calc_satd_16x8 14938 call calc_satd_16x8 14939 call calc_satd_16x8 14940 call calc_satd_16x8 14941 14942 lea r0, [r6 + 96] 14943 lea r2, [r7 + 96] 14944 14945 call calc_satd_16x8 14946 call calc_satd_16x8 14947 call calc_satd_16x8 14948 call calc_satd_16x8 14949 call calc_satd_16x8 14950 call calc_satd_16x8 14951 14952 vextracti128 xm7, m6, 1 14953 paddd xm6, xm7 14954 pxor xm7, xm7 14955 movhlps xm7, xm6 14956 paddd xm6, xm7 14957 pshufd xm7, xm6, 1 14958 paddd xm6, xm7 14959 movd eax, xm6 14960 RET 14961 14962cglobal pixel_satd_64x64, 4,8,8 14963 add r1d, r1d 14964 add r3d, r3d 14965 lea r4, [3 * r1] 14966 lea r5, [3 * r3] 14967 pxor m6, m6 14968 mov r6, r0 14969 mov r7, r2 14970 14971 call calc_satd_16x8 14972 call calc_satd_16x8 14973 call calc_satd_16x8 14974 call calc_satd_16x8 14975 call calc_satd_16x8 14976 call calc_satd_16x8 14977 call calc_satd_16x8 14978 call calc_satd_16x8 14979 14980 lea r0, [r6 + 32] 14981 lea r2, [r7 + 32] 14982 14983 call calc_satd_16x8 14984 call calc_satd_16x8 14985 call calc_satd_16x8 14986 call calc_satd_16x8 14987 call calc_satd_16x8 14988 call calc_satd_16x8 14989 call calc_satd_16x8 14990 call calc_satd_16x8 14991 14992 lea r0, [r6 + 64] 14993 lea r2, [r7 + 64] 14994 14995 call calc_satd_16x8 14996 call calc_satd_16x8 14997 call calc_satd_16x8 14998 call calc_satd_16x8 14999 call calc_satd_16x8 15000 call calc_satd_16x8 15001 call calc_satd_16x8 15002 call calc_satd_16x8 15003 15004 lea r0, [r6 + 96] 15005 lea r2, [r7 + 96] 15006 15007 call calc_satd_16x8 15008 call calc_satd_16x8 15009 call calc_satd_16x8 15010 call calc_satd_16x8 15011 call calc_satd_16x8 15012 call calc_satd_16x8 15013 call calc_satd_16x8 15014 call calc_satd_16x8 15015 15016 vextracti128 xm7, m6, 1 15017 paddd xm6, xm7 15018 pxor xm7, xm7 15019 movhlps xm7, xm6 15020 paddd xm6, xm7 15021 pshufd xm7, xm6, 1 15022 paddd xm6, xm7 15023 movd eax, xm6 15024 RET 15025 15026%macro SATD_HBD_AVX512_END 0 15027 vextracti32x8 ym7, m6, 1 15028 paddd ym6, ym7 15029 vextracti128 xm7, ym6, 1 15030 paddd xm6, xm7 15031 pxor xm7, xm7 15032 movhlps xm7, xm6 15033 paddd xm6, xm7 15034 pshufd xm7, xm6, 1 15035 paddd xm6, xm7 15036 movd eax, xm6 15037%endmacro 15038%macro PROCESS_SATD_16x8_HBD_AVX512 0 ; function to compute satd cost for 16 columns, 8 rows 15039 ; rows 0-3 15040 lea r6, [r0 + r1 * 4] 15041 lea r7, [r2 + r3 * 4] 15042 movu ym0, [r0] 15043 movu ym4, [r2] 15044 vinserti32x8 m0, [r6], 1 15045 vinserti32x8 m4, [r7], 1 15046 psubw m0, m4 15047 movu ym1, [r0 + r1] 15048 movu ym5, [r2 + r3] 15049 vinserti32x8 m1, [r6 + r1], 1 15050 vinserti32x8 m5, [r7 + r3], 1 15051 psubw m1, m5 15052 movu ym2, [r0 + r1 * 2] 15053 movu ym4, [r2 + r3 * 2] 15054 vinserti32x8 m2, [r6 + r1 * 2], 1 15055 vinserti32x8 m4, [r7 + r3 * 2], 1 15056 psubw m2, m4 15057 movu ym3, [r0 + r4] 15058 movu ym5, [r2 + r5] 15059 vinserti32x8 m3, [r6 + r4], 1 15060 vinserti32x8 m5, [r7 + r5], 1 15061 psubw m3, m5 15062 15063 paddw m4, m0, m1 15064 psubw m1, m0 15065 paddw m0, m2, m3 15066 psubw m3, m2 15067 punpckhwd m2, m4, m1 15068 punpcklwd m4, m1 15069 punpckhwd m1, m0, m3 15070 punpcklwd m0, m3 15071 paddw m3, m4, m0 15072 psubw m0, m4 15073 paddw m4, m2, m1 15074 psubw m1, m2 15075 punpckhdq m2, m3, m0 15076 punpckldq m3, m0 15077 paddw m0, m3, m2 15078 psubw 
m2, m3 15079 punpckhdq m3, m4, m1 15080 punpckldq m4, m1 15081 paddw m1, m4, m3 15082 psubw m3, m4 15083 punpckhqdq m4, m0, m1 15084 punpcklqdq m0, m1 15085 pabsw m0, m0 15086 pabsw m4, m4 15087 pmaxsw m0, m0, m4 15088 punpckhqdq m1, m2, m3 15089 punpcklqdq m2, m3 15090 pabsw m2, m2 15091 pabsw m1, m1 15092 pmaxsw m2, m1 15093 pxor m7, m7 15094 mova m1, m0 15095 punpcklwd m1, m7 15096 paddd m6, m1 15097 mova m1, m0 15098 punpckhwd m1, m7 15099 paddd m6, m1 15100 pxor m7, m7 15101 mova m1, m2 15102 punpcklwd m1, m7 15103 paddd m6, m1 15104 mova m1, m2 15105 punpckhwd m1, m7 15106 paddd m6, m1 15107%endmacro 15108%macro PROCESS_SATD_32x4_HBD_AVX512 0 ; function to compute satd cost for 32 columns, 4 rows 15109 ; rows 0-3 15110 movu m0, [r0] 15111 movu m4, [r2] 15112 psubw m0, m4 15113 movu m1, [r0 + r1] 15114 movu m5, [r2 + r3] 15115 psubw m1, m5 15116 movu m2, [r0 + r1 * 2] 15117 movu m4, [r2 + r3 * 2] 15118 psubw m2, m4 15119 movu m3, [r0 + r4] 15120 movu m5, [r2 + r5] 15121 psubw m3, m5 15122 paddw m4, m0, m1 15123 psubw m1, m0 15124 paddw m0, m2, m3 15125 psubw m3, m2 15126 punpckhwd m2, m4, m1 15127 punpcklwd m4, m1 15128 punpckhwd m1, m0, m3 15129 punpcklwd m0, m3 15130 paddw m3, m4, m0 15131 psubw m0, m4 15132 paddw m4, m2, m1 15133 psubw m1, m2 15134 punpckhdq m2, m3, m0 15135 punpckldq m3, m0 15136 paddw m0, m3, m2 15137 psubw m2, m3 15138 punpckhdq m3, m4, m1 15139 punpckldq m4, m1 15140 paddw m1, m4, m3 15141 psubw m3, m4 15142 punpckhqdq m4, m0, m1 15143 punpcklqdq m0, m1 15144 pabsw m0, m0 15145 pabsw m4, m4 15146 pmaxsw m0, m0, m4 15147 punpckhqdq m1, m2, m3 15148 punpcklqdq m2, m3 15149 pabsw m2, m2 15150 pabsw m1, m1 15151 pmaxsw m2, m1 15152 pxor m7, m7 15153 mova m1, m0 15154 punpcklwd m1, m7 15155 paddd m6, m1 15156 mova m1, m0 15157 punpckhwd m1, m7 15158 paddd m6, m1 15159 pxor m7, m7 15160 mova m1, m2 15161 punpcklwd m1, m7 15162 paddd m6, m1 15163 mova m1, m2 15164 punpckhwd m1, m7 15165 paddd m6, m1 15166%endmacro 15167 15168%macro SATD_16xN_HBD_AVX512 1 15169INIT_ZMM avx512 15170cglobal pixel_satd_16x%1, 4,8,8 15171 add r1d, r1d 15172 add r3d, r3d 15173 lea r4, [3 * r1] 15174 lea r5, [3 * r3] 15175 pxor m6, m6 15176 15177%rep %1/8 - 1 15178 PROCESS_SATD_16x8_HBD_AVX512 15179 lea r0, [r6 + 4 * r1] 15180 lea r2, [r7 + 4 * r3] 15181%endrep 15182 PROCESS_SATD_16x8_HBD_AVX512 15183 SATD_HBD_AVX512_END 15184 RET 15185%endmacro 15186 15187SATD_16xN_HBD_AVX512 8 15188SATD_16xN_HBD_AVX512 16 15189SATD_16xN_HBD_AVX512 32 15190SATD_16xN_HBD_AVX512 64 15191 15192%macro SATD_32xN_HBD_AVX512 1 15193INIT_ZMM avx512 15194cglobal pixel_satd_32x%1, 4,8,8 15195 add r1d, r1d 15196 add r3d, r3d 15197 lea r4, [3 * r1] 15198 lea r5, [3 * r3] 15199 pxor m6, m6 15200 mov r6, r0 15201 mov r7, r2 15202%rep %1/4 - 1 15203 PROCESS_SATD_32x4_HBD_AVX512 15204 lea r0, [r0 + 4 * r1] 15205 lea r2, [r2 + 4 * r3] 15206%endrep 15207 PROCESS_SATD_32x4_HBD_AVX512 15208 SATD_HBD_AVX512_END 15209 RET 15210%endmacro 15211 15212SATD_32xN_HBD_AVX512 8 15213SATD_32xN_HBD_AVX512 16 15214SATD_32xN_HBD_AVX512 24 15215SATD_32xN_HBD_AVX512 32 15216SATD_32xN_HBD_AVX512 64 15217INIT_ZMM avx512 15218cglobal pixel_satd_48x64, 4,10,8 15219 add r1d, r1d 15220 add r3d, r3d 15221 lea r4, [3 * r1] 15222 lea r5, [3 * r3] 15223 pxor m6, m6 15224 mov r8, r0 15225 mov r9, r2 15226 15227%rep 15 15228 PROCESS_SATD_32x4_HBD_AVX512 15229 lea r0, [r0 + 4 * r1] 15230 lea r2, [r2 + 4 * r3] 15231%endrep 15232 PROCESS_SATD_32x4_HBD_AVX512 15233 lea r0, [r8 + mmsize] 15234 lea r2, [r9 + mmsize] 15235%rep 7 15236 
PROCESS_SATD_16x8_HBD_AVX512 15237 lea r0, [r6 + 4 * r1] 15238 lea r2, [r7 + 4 * r3] 15239%endrep 15240 PROCESS_SATD_16x8_HBD_AVX512 15241 SATD_HBD_AVX512_END 15242 RET 15243 15244%macro SATD_64xN_HBD_AVX512 1 15245INIT_ZMM avx512 15246cglobal pixel_satd_64x%1, 4,8,8 15247 add r1d, r1d 15248 add r3d, r3d 15249 lea r4, [3 * r1] 15250 lea r5, [3 * r3] 15251 pxor m6, m6 15252 mov r6, r0 15253 mov r7, r2 15254%rep %1/4 - 1 15255 PROCESS_SATD_32x4_HBD_AVX512 15256 lea r0, [r0 + 4 * r1] 15257 lea r2, [r2 + 4 * r3] 15258%endrep 15259 PROCESS_SATD_32x4_HBD_AVX512 15260 lea r0, [r6 + mmsize] 15261 lea r2, [r7 + mmsize] 15262%rep %1/4 - 1 15263 PROCESS_SATD_32x4_HBD_AVX512 15264 lea r0, [r0 + 4 * r1] 15265 lea r2, [r2 + 4 * r3] 15266%endrep 15267 PROCESS_SATD_32x4_HBD_AVX512 15268 SATD_HBD_AVX512_END 15269 RET 15270%endmacro 15271 15272SATD_64xN_HBD_AVX512 16 15273SATD_64xN_HBD_AVX512 32 15274SATD_64xN_HBD_AVX512 48 15275SATD_64xN_HBD_AVX512 64 15276%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1 15277 15278 15279;------------------------------------------------------------------------------------------------------------------------------------- 15280; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix) 15281;------------------------------------------------------------------------------------------------------------------------------------- 15282%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 15283INIT_YMM avx2 15284cglobal planeClipAndMax, 5,7,8 15285 movd xm0, r5m 15286 vpbroadcastb m0, xm0 ; m0 = [min] 15287 vpbroadcastb m1, r6m ; m1 = [max] 15288 pxor m2, m2 ; m2 = sumLuma 15289 pxor m3, m3 ; m3 = maxLumaLevel 15290 pxor m4, m4 ; m4 = zero 15291 15292 ; get mask to partial register pixels 15293 mov r5d, r2d 15294 and r2d, ~(mmsize - 1) 15295 sub r5d, r2d 15296 lea r6, [pb_movemask_32 + mmsize] 15297 sub r6, r5 15298 movu m5, [r6] ; m5 = mask for last couple column 15299 15300.loopH: 15301 lea r5d, [r2 - mmsize] 15302 15303.loopW: 15304 movu m6, [r0 + r5] 15305 pmaxub m6, m0 15306 pminub m6, m1 15307 movu [r0 + r5], m6 ; store back 15308 pmaxub m3, m6 ; update maxLumaLevel 15309 psadbw m6, m4 15310 paddq m2, m6 15311 15312 sub r5d, mmsize 15313 jge .loopW 15314 15315 ; partial pixels 15316 movu m7, [r0 + r2] 15317 pmaxub m6, m7, m0 15318 pminub m6, m1 15319 15320 pand m7, m5 ; get invalid/unchange pixel 15321 pandn m6, m5, m6 ; clear invalid pixels 15322 por m7, m6 ; combin valid & invalid pixels 15323 movu [r0 + r2], m7 ; store back 15324 pmaxub m3, m6 ; update maxLumaLevel 15325 psadbw m6, m4 15326 paddq m2, m6 15327 15328.next: 15329 add r0, r1 15330 dec r3d 15331 jg .loopH 15332 15333 ; sumLuma 15334 vextracti128 xm0, m2, 1 15335 paddq xm0, xm2 15336 movhlps xm1, xm0 15337 paddq xm0, xm1 15338 movq [r4], xm0 15339 15340 ; maxLumaLevel 15341 vextracti128 xm0, m3, 1 15342 pmaxub xm0, xm3 15343 movhlps xm3, xm0 15344 pmaxub xm0, xm3 15345 pmovzxbw xm0, xm0 15346 pxor xm0, [pb_movemask + 16] 15347 phminposuw xm0, xm0 15348 15349 movd eax, xm0 15350 not al 15351 movzx eax, al 15352 RET 15353%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 15354 15355 15356%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 15357%macro LOAD_DIFF_AVX2 4 15358 movu %1, %3 15359 movu %2, %4 15360 psubw %1, %2 15361%endmacro 15362 15363%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer 15364 LOAD_DIFF_AVX2 xm%1, xm%5, [%7], [%8] 15365 LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1], [%8+r3] 15366 LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3] 
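    ; r4/r5 hold 3*stride (set up by the caller), so the final load/diff below
    ; covers row 3 of the 8x4 block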
15367 LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4], [%8+r5] 15368 15369 ;lea %7, [%7+4*r1] 15370 ;lea %8, [%8+4*r3] 15371%endmacro 15372 15373%if ARCH_X86_64 15374INIT_YMM avx2 15375cglobal pixel_satd_8x8, 4,4,7 15376 15377 FIX_STRIDES r1, r3 15378 pxor xm6, xm6 15379 15380 ; load_diff 0 & 4 15381 movu xm0, [r0] 15382 movu xm1, [r2] 15383 vinserti128 m0, m0, [r0 + r1 * 4], 1 15384 vinserti128 m1, m1, [r2 + r3 * 4], 1 15385 psubw m0, m1 15386 add r0, r1 15387 add r2, r3 15388 15389 ; load_diff 1 & 5 15390 movu xm1, [r0] 15391 movu xm2, [r2] 15392 vinserti128 m1, m1, [r0 + r1 * 4], 1 15393 vinserti128 m2, m2, [r2 + r3 * 4], 1 15394 psubw m1, m2 15395 add r0, r1 15396 add r2, r3 15397 15398 ; load_diff 2 & 6 15399 movu xm2, [r0] 15400 movu xm3, [r2] 15401 vinserti128 m2, m2, [r0 + r1 * 4], 1 15402 vinserti128 m3, m3, [r2 + r3 * 4], 1 15403 psubw m2, m3 15404 add r0, r1 15405 add r2, r3 15406 15407 ; load_diff 3 & 7 15408 movu xm3, [r0] 15409 movu xm4, [r2] 15410 vinserti128 m3, m3, [r0 + r1 * 4], 1 15411 vinserti128 m4, m4, [r2 + r3 * 4], 1 15412 psubw m3, m4 15413 15414 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 15415 15416 vextracti128 xm0, m6, 1 15417 paddw xm6, xm0 15418 HADDUW xm6, xm0 15419 movd eax, xm6 15420 RET 15421 15422INIT_XMM avx2 15423cglobal pixel_sa8d_8x8_internal 15424 lea r6, [r0+4*r1] 15425 lea r7, [r2+4*r3] 15426 LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 15427 LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 15428 15429 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax 15430 ;HADAMARD2_2D 0, 1, 2, 8, 6, wd 15431 ;HADAMARD2_2D 4, 5, 3, 9, 6, wd 15432 ;HADAMARD2_2D 0, 2, 1, 8, 6, dq 15433 ;HADAMARD2_2D 4, 3, 5, 9, 6, dq 15434 ;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax 15435 ;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax 15436 15437 paddw m0, m1 15438 paddw m0, m2 15439 paddw m0, m8 15440 SAVE_MM_PERMUTATION 15441 ret 15442 15443 15444INIT_XMM avx2 15445cglobal pixel_sa8d_8x8, 4,8,12 15446 FIX_STRIDES r1, r3 15447 lea r4, [3*r1] 15448 lea r5, [3*r3] 15449 call pixel_sa8d_8x8_internal 15450 HADDUW m0, m1 15451 movd eax, m0 15452 add eax, 1 15453 shr eax, 1 15454 RET 15455 15456 15457INIT_YMM avx2 15458cglobal pixel_sa8d_16x16, 4,8,12 15459 FIX_STRIDES r1, r3 15460 lea r4, [3*r1] 15461 lea r5, [3*r3] 15462 lea r6, [r0+4*r1] 15463 lea r7, [r2+4*r3] 15464 vbroadcasti128 m7, [pw_1] 15465 15466 ; Top 16x8 15467 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 15468 movu m0, [r0] ; 10 bits 15469 movu m5, [r2] 15470 psubw m0, m5 ; 11 bits 15471 movu m1, [r0 + r1] 15472 movu m6, [r2 + r3] 15473 psubw m1, m6 15474 movu m2, [r0 + r1 * 2] 15475 movu m5, [r2 + r3 * 2] 15476 psubw m2, m5 15477 movu m8, [r0 + r4] 15478 movu m6, [r2 + r5] 15479 psubw m8, m6 15480 15481 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 15482 movu m4, [r6] 15483 movu m11, [r7] 15484 psubw m4, m11 15485 movu m5, [r6 + r1] 15486 movu m6, [r7 + r3] 15487 psubw m5, m6 15488 movu m3, [r6 + r1 * 2] 15489 movu m11, [r7 + r3 * 2] 15490 psubw m3, m11 15491 movu m9, [r6 + r4] 15492 movu m6, [r7 + r5] 15493 psubw m9, m6 15494 15495 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax ; 16 bits 15496 pmaddwd m0, m7 15497 pmaddwd m1, m7 15498 pmaddwd m2, m7 15499 pmaddwd m8, m7 15500 paddd m0, m1 15501 paddd m2, m8 15502 paddd m10, m0, m2 15503 15504 lea r0, [r0+8*r1] 15505 lea r2, [r2+8*r3] 15506 lea r6, [r6+8*r1] 15507 lea r7, [r7+8*r3] 15508 15509 ; Bottom 16x8 15510 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 15511 movu m0, [r0] 15512 movu m5, [r2] 15513 psubw m0, m5 15514 movu m1, [r0 + r1] 15515 movu m6, [r2 + r3] 15516 psubw m1, m6 15517 movu m2, [r0 + 
r1 * 2] 15518 movu m5, [r2 + r3 * 2] 15519 psubw m2, m5 15520 movu m8, [r0 + r4] 15521 movu m6, [r2 + r5] 15522 psubw m8, m6 15523 15524 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 15525 movu m4, [r6] 15526 movu m11, [r7] 15527 psubw m4, m11 15528 movu m5, [r6 + r1] 15529 movu m6, [r7 + r3] 15530 psubw m5, m6 15531 movu m3, [r6 + r1 * 2] 15532 movu m11, [r7 + r3 * 2] 15533 psubw m3, m11 15534 movu m9, [r6 + r4] 15535 movu m6, [r7 + r5] 15536 psubw m9, m6 15537 15538 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax 15539 pmaddwd m0, m7 15540 pmaddwd m1, m7 15541 pmaddwd m2, m7 15542 pmaddwd m8, m7 15543 paddd m0, m1 15544 paddd m2, m8 15545 paddd m10, m0 15546 paddd m10, m2 15547 15548 HADDD m10, m0 15549 15550 movd eax, xm10 15551 add eax, 1 15552 shr eax, 1 15553 RET 15554 15555 15556; TODO: optimize me, need more 2 of YMM registers because C model get partial result every 16x16 block 15557INIT_YMM avx2 15558cglobal pixel_sa8d_32x32, 4,8,14 15559 FIX_STRIDES r1, r3 15560 lea r4, [3*r1] 15561 lea r5, [3*r3] 15562 lea r6, [r0+4*r1] 15563 lea r7, [r2+4*r3] 15564 vbroadcasti128 m7, [pw_1] 15565 15566 15567 ;SA8D[16x8] ; pix[0] 15568 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 15569 movu m0, [r0] 15570 movu m5, [r2] 15571 psubw m0, m5 15572 movu m1, [r0 + r1] 15573 movu m6, [r2 + r3] 15574 psubw m1, m6 15575 movu m2, [r0 + r1 * 2] 15576 movu m5, [r2 + r3 * 2] 15577 psubw m2, m5 15578 movu m8, [r0 + r4] 15579 movu m6, [r2 + r5] 15580 psubw m8, m6 15581 15582 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 15583 movu m4, [r6] 15584 movu m11, [r7] 15585 psubw m4, m11 15586 movu m5, [r6 + r1] 15587 movu m6, [r7 + r3] 15588 psubw m5, m6 15589 movu m3, [r6 + r1 * 2] 15590 movu m11, [r7 + r3 * 2] 15591 psubw m3, m11 15592 movu m9, [r6 + r4] 15593 movu m6, [r7 + r5] 15594 psubw m9, m6 15595 15596 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax 15597 pmaddwd m0, m7 15598 pmaddwd m1, m7 15599 pmaddwd m2, m7 15600 pmaddwd m8, m7 15601 paddd m0, m1 15602 paddd m2, m8 15603 paddd m10, m0, m2 15604 15605 15606 ; SA8D[16x8] ; pix[16] 15607 add r0, mmsize 15608 add r2, mmsize 15609 add r6, mmsize 15610 add r7, mmsize 15611 15612 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 15613 movu m0, [r0] 15614 movu m5, [r2] 15615 psubw m0, m5 15616 movu m1, [r0 + r1] 15617 movu m6, [r2 + r3] 15618 psubw m1, m6 15619 movu m2, [r0 + r1 * 2] 15620 movu m5, [r2 + r3 * 2] 15621 psubw m2, m5 15622 movu m8, [r0 + r4] 15623 movu m6, [r2 + r5] 15624 psubw m8, m6 15625 15626 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 15627 movu m4, [r6] 15628 movu m11, [r7] 15629 psubw m4, m11 15630 movu m5, [r6 + r1] 15631 movu m6, [r7 + r3] 15632 psubw m5, m6 15633 movu m3, [r6 + r1 * 2] 15634 movu m11, [r7 + r3 * 2] 15635 psubw m3, m11 15636 movu m9, [r6 + r4] 15637 movu m6, [r7 + r5] 15638 psubw m9, m6 15639 15640 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax 15641 pmaddwd m0, m7 15642 pmaddwd m1, m7 15643 pmaddwd m2, m7 15644 pmaddwd m8, m7 15645 paddd m0, m1 15646 paddd m2, m8 15647 paddd m12, m0, m2 15648 15649 15650 ; SA8D[16x8] ; pix[8*stride+16] 15651 lea r0, [r0+8*r1] 15652 lea r2, [r2+8*r3] 15653 lea r6, [r6+8*r1] 15654 lea r7, [r7+8*r3] 15655 15656 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 15657 movu m0, [r0] 15658 movu m5, [r2] 15659 psubw m0, m5 15660 movu m1, [r0 + r1] 15661 movu m6, [r2 + r3] 15662 psubw m1, m6 15663 movu m2, [r0 + r1 * 2] 15664 movu m5, [r2 + r3 * 2] 15665 psubw m2, m5 15666 movu m8, [r0 + r4] 15667 movu m6, [r2 + r5] 15668 psubw m8, m6 15669 15670 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 15671 movu 
    movu m11, [r7]
    psubw m4, m11
    movu m5, [r6 + r1]
    movu m6, [r7 + r3]
    psubw m5, m6
    movu m3, [r6 + r1 * 2]
    movu m11, [r7 + r3 * 2]
    psubw m3, m11
    movu m9, [r6 + r4]
    movu m6, [r7 + r5]
    psubw m9, m6

    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    pmaddwd m0, m7
    pmaddwd m1, m7
    pmaddwd m2, m7
    pmaddwd m8, m7
    paddd m0, m1
    paddd m2, m8
    paddd m12, m0
    paddd m12, m2

    ; sum[1]
    HADDD m12, m0


    ; SA8D[16x8] ; pix[8*stride]
    sub r0, mmsize
    sub r2, mmsize
    sub r6, mmsize
    sub r7, mmsize

    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
    movu m0, [r0]
    movu m5, [r2]
    psubw m0, m5
    movu m1, [r0 + r1]
    movu m6, [r2 + r3]
    psubw m1, m6
    movu m2, [r0 + r1 * 2]
    movu m5, [r2 + r3 * 2]
    psubw m2, m5
    movu m8, [r0 + r4]
    movu m6, [r2 + r5]
    psubw m8, m6

    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
    movu m4, [r6]
    movu m11, [r7]
    psubw m4, m11
    movu m5, [r6 + r1]
    movu m6, [r7 + r3]
    psubw m5, m6
    movu m3, [r6 + r1 * 2]
    movu m11, [r7 + r3 * 2]
    psubw m3, m11
    movu m9, [r6 + r4]
    movu m6, [r7 + r5]
    psubw m9, m6

    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    pmaddwd m0, m7
    pmaddwd m1, m7
    pmaddwd m2, m7
    pmaddwd m8, m7
    paddd m0, m1
    paddd m2, m8
    paddd m10, m0
    paddd m10, m2

    ; sum[0]
    HADDD m10, m0
    punpckldq xm10, xm12


    ;SA8D[16x8] ; pix[16*stride]
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    lea r6, [r6+8*r1]
    lea r7, [r7+8*r3]

    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
    movu m0, [r0]
    movu m5, [r2]
    psubw m0, m5
    movu m1, [r0 + r1]
    movu m6, [r2 + r3]
    psubw m1, m6
    movu m2, [r0 + r1 * 2]
    movu m5, [r2 + r3 * 2]
    psubw m2, m5
    movu m8, [r0 + r4]
    movu m6, [r2 + r5]
    psubw m8, m6

    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
    movu m4, [r6]
    movu m11, [r7]
    psubw m4, m11
    movu m5, [r6 + r1]
    movu m6, [r7 + r3]
    psubw m5, m6
    movu m3, [r6 + r1 * 2]
    movu m11, [r7 + r3 * 2]
    psubw m3, m11
    movu m9, [r6 + r4]
    movu m6, [r7 + r5]
    psubw m9, m6

    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    pmaddwd m0, m7
    pmaddwd m1, m7
    pmaddwd m2, m7
    pmaddwd m8, m7
    paddd m0, m1
    paddd m2, m8
    paddd m12, m0, m2


    ; SA8D[16x8] ; pix[16*stride+16]
    add r0, mmsize
    add r2, mmsize
    add r6, mmsize
    add r7, mmsize

    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
    movu m0, [r0]
    movu m5, [r2]
    psubw m0, m5
    movu m1, [r0 + r1]
    movu m6, [r2 + r3]
    psubw m1, m6
    movu m2, [r0 + r1 * 2]
    movu m5, [r2 + r3 * 2]
    psubw m2, m5
    movu m8, [r0 + r4]
    movu m6, [r2 + r5]
    psubw m8, m6

    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
    movu m4, [r6]
    movu m11, [r7]
    psubw m4, m11
    movu m5, [r6 + r1]
    movu m6, [r7 + r3]
    psubw m5, m6
    movu m3, [r6 + r1 * 2]
    movu m11, [r7 + r3 * 2]
    psubw m3, m11
    movu m9, [r6 + r4]
    movu m6, [r7 + r5]
    psubw m9, m6

    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    pmaddwd m0, m7
    pmaddwd m1, m7
    pmaddwd m2, m7
    pmaddwd m8, m7
    paddd m0, m1
    paddd m2, m8
    paddd m13, m0, m2


    ; SA8D[16x8] ; pix[24*stride+16]
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    lea r6, [r6+8*r1]
    lea r7, [r7+8*r3]

    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
    movu m0, [r0]
    movu m5, [r2]
    psubw m0, m5
    movu m1, [r0 + r1]
    movu m6, [r2 + r3]
    psubw m1, m6
    movu m2, [r0 + r1 * 2]
    movu m5, [r2 + r3 * 2]
    psubw m2, m5
    movu m8, [r0 + r4]
    movu m6, [r2 + r5]
    psubw m8, m6

    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
    movu m4, [r6]
    movu m11, [r7]
    psubw m4, m11
    movu m5, [r6 + r1]
    movu m6, [r7 + r3]
    psubw m5, m6
    movu m3, [r6 + r1 * 2]
    movu m11, [r7 + r3 * 2]
    psubw m3, m11
    movu m9, [r6 + r4]
    movu m6, [r7 + r5]
    psubw m9, m6

    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    pmaddwd m0, m7
    pmaddwd m1, m7
    pmaddwd m2, m7
    pmaddwd m8, m7
    paddd m0, m1
    paddd m2, m8
    paddd m13, m0
    paddd m13, m2

    ; sum[3]
    HADDD m13, m0


    ; SA8D[16x8] ; pix[24*stride]
    sub r0, mmsize
    sub r2, mmsize
    sub r6, mmsize
    sub r7, mmsize

    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
    movu m0, [r0]
    movu m5, [r2]
    psubw m0, m5
    movu m1, [r0 + r1]
    movu m6, [r2 + r3]
    psubw m1, m6
    movu m2, [r0 + r1 * 2]
    movu m5, [r2 + r3 * 2]
    psubw m2, m5
    movu m8, [r0 + r4]
    movu m6, [r2 + r5]
    psubw m8, m6

    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
    movu m4, [r6]
    movu m11, [r7]
    psubw m4, m11
    movu m5, [r6 + r1]
    movu m6, [r7 + r3]
    psubw m5, m6
    movu m3, [r6 + r1 * 2]
    movu m11, [r7 + r3 * 2]
    psubw m3, m11
    movu m9, [r6 + r4]
    movu m6, [r7 + r5]
    psubw m9, m6

    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    pmaddwd m0, m7
    pmaddwd m1, m7
    pmaddwd m2, m7
    pmaddwd m8, m7
    paddd m0, m1
    paddd m2, m8
    paddd m12, m0
    paddd m12, m2

    ; sum[2]
    HADDD m12, m0
    punpckldq xm12, xm13

    ; SA8D: round each 16x16 partial sum ((x + 1) >> 1), then add the four results
    punpcklqdq xm0, xm10, xm12
    paddd xm0, [pd_1]
    psrld xm0, 1
    HADDD xm0, xm1

    movd eax, xm0
    RET
%endif
%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10

;template<int log2TrSize>
;static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
;{
;    *ssBlock = 0;
;    const uint32_t trSize = 1 << log2TrSize;
;    for (int y = 0; y < trSize; y++)
;    {
;        for (int x = 0; x < trSize; x++)
;        {
;            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff
;            *ssBlock += temp * temp;
;        }
;    }
;
;    *ac_k = 0;
;    for (int block_yy = 0; block_yy < trSize; block_yy += 1)
;    {
;        for (int block_xx = 0; block_xx < trSize; block_xx += 1)
;        {
;            uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
;            *ac_k += temp * temp;
;        }
;    }
;}
;-----------------------------------------------------------------------------------------------------------------
; void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
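;
; The AVX2 kernels below implement the two reference loops above in a single pass: each
; row of fenc/recon is loaded once and widened with vpmovzx*, the squared difference is
; accumulated in one vector register (m4, ssBlock) and the squared (fenc >> shift) term
; in another (m7, ac_k), and the lanes are reduced to scalars only after the last row.
; The shift argument (r5) is reused as the row counter, so these kernels assume shift
; equals the build-time SSIMRD_SHIFT constant. A rough scalar sketch of that single-pass
; strategy follows; it is illustrative only, not x265 code, and the names are made up:
;
;    uint64_t ss = 0, ac = 0;                        // vector accumulators m4 / m7
;    for (uint32_t y = 0; y < trSize; y++)
;    {
;        for (uint32_t x = 0; x < trSize; x++)       // a vector's worth of pixels at a time
;        {
;            int d = fenc[x] - recon[x];
;            uint32_t a = fenc[x] >> SSIMRD_SHIFT;
;            ss += (int64_t)d * d;
;            ac += (uint64_t)a * a;
;        }
;        fenc += fStride;
;        recon += rstride;
;    }
;    *ssBlock = ss;                                  // movq [r4], xm4
;    *ac_k = ac;                                     // movq [r6], xm7
;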
;-----------------------------------------------------------------------------------------------------------------

INIT_YMM avx2
cglobal ssimDist4, 7, 8, 8
    mov r5d, 4
    vpxor m4, m4               ;ssBlock
    vpxor m3, m3
    vpxor m7, m7               ;ac_k
.row:
%if HIGH_BIT_DEPTH
    vpmovzxwq m0, [r0]         ;fenc
    vpmovzxwq m1, [r2]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbq m0, [r0]
    vpmovzxbq m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif
    vpsrlq m6, m0, SSIMRD_SHIFT
    vpsubq m0, m1
    vpmuldq m0, m0, m0
    vpmuldq m6, m6, m6
    vpaddq m4, m0
    vpaddq m7, m6

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r5d
    jnz .row
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4
    movq [r6], xm7
    RET


INIT_YMM avx2
cglobal ssimDist8, 7, 8, 8
    mov r5d, 8
    vpxor m4, m4               ;ssBlock
    vpxor m3, m3
    vpxor m7, m7               ;ac_k
.row:
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0]         ;fenc
    vpmovzxwd m1, [r2]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0]
    vpmovzxbd m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_DIST_HIGH m0, m1

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r5d
    jnz .row
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4
    movq [r6], xm7
    RET


INIT_YMM avx2
cglobal ssimDist16, 7, 8, 8
    mov r5d, 16
    vpxor m4, m4               ;ssBlock
    vpxor m3, m3
    vpxor m7, m7               ;ac_k
.row:
%if HIGH_BIT_DEPTH
;Col 1-8
    vpmovzxwd m0, [r0]         ;fenc
    vpmovzxwd m1, [r2]         ;recon

    SSIM_DIST_HIGH m0, m1

;Col 9-16
    vpmovzxwd m0, [r0 + 16]
    vpmovzxwd m1, [r2 + 16]

    SSIM_DIST_HIGH m0, m1

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
%elif BIT_DEPTH == 8
;col 1-16
    vpmovzxbw m0, [r0]         ;fenc
    vpmovzxbw m1, [r2]         ;recon

    SSIM_DIST_LOW m0, m1

    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%else
    %error Unsupported BIT_DEPTH!
%endif
    dec r5d
    jnz .row

%if HIGH_BIT_DEPTH
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2
%else
    vextracti128 xm5, m4, 1
    vpaddd xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddd xm4, xm2
    punpckldq xm4, xm4, xm3
    punpckhqdq xm2, xm4, xm3
    paddd xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddd xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddd xm7, xm2
    punpckldq xm7, xm7, xm3
    punpckhqdq xm2, xm7, xm3
    paddd xm7, xm2
%endif
    movq [r4], xm4
    movq [r6], xm7
    RET


INIT_YMM avx2
cglobal ssimDist32, 7, 8, 8
    mov r5d, 32
    vpxor m4, m4               ;ssBlock
    vpxor m3, m3
    vpxor m7, m7               ;ac_k
.row:
%if HIGH_BIT_DEPTH
;Col 1-8
    vpmovzxwd m0, [r0]         ;fenc
    vpmovzxwd m1, [r2]         ;recon

    SSIM_DIST_HIGH m0, m1

;Col 9-16
    vpmovzxwd m0, [r0 + 16]
    vpmovzxwd m1, [r2 + 16]

    SSIM_DIST_HIGH m0, m1

;Col 17-24
    vpmovzxwd m0, [r0 + 32]
    vpmovzxwd m1, [r2 + 32]

    SSIM_DIST_HIGH m0, m1

;Col 25-32
    vpmovzxwd m0, [r0 + 48]
    vpmovzxwd m1, [r2 + 48]

    SSIM_DIST_HIGH m0, m1

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
%elif BIT_DEPTH == 8
;col 1-16
    vpmovzxbw m0, [r0]         ;fenc
    vpmovzxbw m1, [r2]         ;recon

    SSIM_DIST_LOW m0, m1

;col 17-32
    vpmovzxbw m0, [r0 + 16]
    vpmovzxbw m1, [r2 + 16]

    SSIM_DIST_LOW m0, m1

    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%else
    %error Unsupported BIT_DEPTH!
%endif
    dec r5d
    jnz .row

%if HIGH_BIT_DEPTH
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2
%else
    vextracti128 xm5, m4, 1
    vpaddd xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddd xm4, xm2
    punpckldq xm4, xm4, xm3
    punpckhqdq xm2, xm4, xm3
    paddd xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddd xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddd xm7, xm2
    punpckldq xm7, xm7, xm3
    punpckhqdq xm2, xm7, xm3
    paddd xm7, xm2
%endif
    movq [r4], xm4
    movq [r6], xm7
    RET


INIT_YMM avx2
cglobal ssimDist64, 7, 8, 8
    mov r5d, 64
    vpxor m4, m4               ;ssBlock
    vpxor m3, m3
    vpxor m7, m7               ;ac_k
.row:
%if HIGH_BIT_DEPTH
;Col 1-8
    vpmovzxwd m0, [r0]         ;fenc
    vpmovzxwd m1, [r2]         ;recon

    SSIM_DIST_HIGH m0, m1

;Col 9-16
    vpmovzxwd m0, [r0 + 16]
    vpmovzxwd m1, [r2 + 16]

    SSIM_DIST_HIGH m0, m1

;Col 17-24
    vpmovzxwd m0, [r0 + 32]
    vpmovzxwd m1, [r2 + 32]

    SSIM_DIST_HIGH m0, m1

;Col 25-32
    vpmovzxwd m0, [r0 + 48]
    vpmovzxwd m1, [r2 + 48]

    SSIM_DIST_HIGH m0, m1

;Col 33-40
    vpmovzxwd m0, [r0 + 64]
    vpmovzxwd m1, [r2 + 64]

    SSIM_DIST_HIGH m0, m1

;Col 41-48
    vpmovzxwd m0, [r0 + 80]
    vpmovzxwd m1, [r2 + 80]

    SSIM_DIST_HIGH m0, m1

;Col 49-56
    vpmovzxwd m0, [r0 + 96]
    vpmovzxwd m1, [r2 + 96]

    SSIM_DIST_HIGH m0, m1

;Col 57-64
    vpmovzxwd m0, [r0 + 112]
    vpmovzxwd m1, [r2 + 112]

    SSIM_DIST_HIGH m0, m1

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
%elif BIT_DEPTH == 8
;col 1-16
    vpmovzxbw m0, [r0]         ;fenc
    vpmovzxbw m1, [r2]         ;recon

    SSIM_DIST_LOW m0, m1

;col 17-32
    vpmovzxbw m0, [r0 + 16]
    vpmovzxbw m1, [r2 + 16]

    SSIM_DIST_LOW m0, m1

;col 33-48
    vpmovzxbw m0, [r0 + 32]
    vpmovzxbw m1, [r2 + 32]

    SSIM_DIST_LOW m0, m1

;col 49-64
    vpmovzxbw m0, [r0 + 48]
    vpmovzxbw m1, [r2 + 48]

    SSIM_DIST_LOW m0, m1

    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r5d
    jnz .row

%if HIGH_BIT_DEPTH
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2
%else
    vextracti128 xm5, m4, 1
    vpaddd xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddd xm4, xm2
    punpckldq xm4, xm4, xm3
    punpckhqdq xm2, xm4, xm3
    paddd xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddd xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddd xm7, xm2
    punpckldq xm7, xm7, xm3
    punpckhqdq xm2, xm7, xm3
    paddd xm7, xm2
%endif
    movq [r4], xm4
    movq [r6], xm7
    RET


;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
;{
;    *z_k = 0;
;    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
;    {
;        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
;        {
;            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
;            *z_k += temp * temp;
;        }
;    }
;}
;--------------------------------------------------------------------------------------
; void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal normFact8, 4, 5, 6
    mov r4d, 8
    vpxor m3, m3               ;z_k
    vpxor m5, m5
.row:
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0]         ;src
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0]
%else
    %error Unsupported BIT_DEPTH!
%endif

    NORM_FACT_HIGH m0

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]
%else
    lea r0, [r0 + r1]
%endif
    dec r4d
    jnz .row
    vextracti128 xm4, m3, 1
    vpaddq xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddq xm3, xm2
    movq [r3], xm3
    RET


INIT_YMM avx2
cglobal normFact16, 4, 5, 6
    mov r4d, 16
    vpxor m3, m3               ;z_k
    vpxor m5, m5
.row:
%if HIGH_BIT_DEPTH
;Col 1-8
    vpmovzxwd m0, [r0]         ;src

    NORM_FACT_HIGH m0

;Col 9-16
    vpmovzxwd m0, [r0 + 16]

    NORM_FACT_HIGH m0

    lea r0, [r0 + 2 * r1]
%elif BIT_DEPTH == 8
;col 1-16
    vpmovzxbw m0, [r0]         ;src

    NORM_FACT_LOW m0

    lea r0, [r0 + r1]
%else
    %error Unsupported BIT_DEPTH!
%endif
    dec r4d
    jnz .row

%if HIGH_BIT_DEPTH
    vextracti128 xm4, m3, 1
    vpaddq xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddq xm3, xm2
%else
    vextracti128 xm4, m3, 1
    vpaddd xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddd xm3, xm2
    punpckldq xm3, xm3, xm5
    punpckhqdq xm2, xm3, xm5
    paddd xm3, xm2
%endif
    movq [r3], xm3
    RET


INIT_YMM avx2
cglobal normFact32, 4, 5, 6
    mov r4d, 32
    vpxor m3, m3               ;z_k
    vpxor m5, m5
.row:
%if HIGH_BIT_DEPTH
;Col 1-8
    vpmovzxwd m0, [r0]         ;src

    NORM_FACT_HIGH m0

;Col 9-16
    vpmovzxwd m0, [r0 + 16]

    NORM_FACT_HIGH m0

;Col 17-24
    vpmovzxwd m0, [r0 + 32]

    NORM_FACT_HIGH m0

;Col 25-32
    vpmovzxwd m0, [r0 + 48]

    NORM_FACT_HIGH m0

    lea r0, [r0 + 2 * r1]
%elif BIT_DEPTH == 8
;col 1-16
    vpmovzxbw m0, [r0]         ;src

    NORM_FACT_LOW m0
;col 17-32
    vpmovzxbw m0, [r0 + 16]

    NORM_FACT_LOW m0

    lea r0, [r0 + r1]
%else
    %error Unsupported BIT_DEPTH!
%endif
    dec r4d
    jnz .row

%if HIGH_BIT_DEPTH
    vextracti128 xm4, m3, 1
    vpaddq xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddq xm3, xm2
%else
    vextracti128 xm4, m3, 1
    vpaddd xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddd xm3, xm2
    punpckldq xm3, xm3, xm5
    punpckhqdq xm2, xm3, xm5
    paddd xm3, xm2
%endif
    movq [r3], xm3
    RET


INIT_YMM avx2
cglobal normFact64, 4, 5, 6
    mov r4d, 64
    vpxor m3, m3               ;z_k
    vpxor m5, m5
.row:
%if HIGH_BIT_DEPTH
;Col 1-8
    vpmovzxwd m0, [r0]         ;src

    NORM_FACT_HIGH m0

;Col 9-16
    vpmovzxwd m0, [r0 + 16]

    NORM_FACT_HIGH m0

;Col 17-24
    vpmovzxwd m0, [r0 + 32]

    NORM_FACT_HIGH m0

;Col 25-32
    vpmovzxwd m0, [r0 + 48]

    NORM_FACT_HIGH m0

;Col 33-40
    vpmovzxwd m0, [r0 + 64]

    NORM_FACT_HIGH m0

;Col 41-48
    vpmovzxwd m0, [r0 + 80]

    NORM_FACT_HIGH m0

;Col 49-56
    vpmovzxwd m0, [r0 + 96]

    NORM_FACT_HIGH m0

;Col 57-64
    vpmovzxwd m0, [r0 + 112]

    NORM_FACT_HIGH m0

    lea r0, [r0 + 2 * r1]
%elif BIT_DEPTH == 8
;col 1-16
    vpmovzxbw m0, [r0]         ;src

    NORM_FACT_LOW m0
;col 17-32
    vpmovzxbw m0, [r0 + 16]

    NORM_FACT_LOW m0
;col 33-48
    vpmovzxbw m0, [r0 + 32]

    NORM_FACT_LOW m0
;col 49-64
    vpmovzxbw m0, [r0 + 48]

    NORM_FACT_LOW m0

    lea r0, [r0 + r1]
%else
    %error Unsupported BIT_DEPTH!
%endif
    dec r4d
    jnz .row

%if HIGH_BIT_DEPTH
    vextracti128 xm4, m3, 1
    vpaddq xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddq xm3, xm2
%else
    vextracti128 xm4, m3, 1
    vpaddd xm3, xm4
    punpckhqdq xm2, xm3, xm5
    paddd xm3, xm2
    punpckldq xm3, xm3, xm5
    punpckhqdq xm2, xm3, xm5
    paddd xm3, xm2
%endif
    movq [r3], xm3
    RET
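;
; All of the ssimDist*/normFact* kernels above finish with the same horizontal reduction
; of a 256-bit accumulator: the upper 128-bit half is folded onto the lower half, then
; the upper 64-bit lane is folded onto the low lane (the 8-bit paths, which accumulate
; dwords, fold one step further), and the low qword is stored with movq. A rough C
; sketch of the qword case, illustrative only, with lane[] standing for the four 64-bit
; lanes of the ymm accumulator:
;
;    uint64_t reduce_qword_lanes(const uint64_t lane[4])
;    {
;        uint64_t lo = lane[0] + lane[2];            // vextracti128 + vpaddq
;        uint64_t hi = lane[1] + lane[3];
;        return lo + hi;                             // punpckhqdq (against zero) + paddq, then movq
;    }
;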