;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2021 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_1
cextern pw_4
cextern pw_8

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
%if %1*%2 == 256
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro

INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16,  8, 1
SAD_MMX  8, 16, 2
SAD_MMX  8,  8, 2
SAD_MMX  8,  4, 2
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2
INIT_MMX ssse3
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2

;=============================================================================
; SAD XMM
;=============================================================================
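; Each SAD_INC_2ROW invocation below processes two rows of the block; widths
; whose 16-bit rows do not fit in one register (2*width > mmsize) use two
; loads per row. Word sums accumulate in m0 and are reduced with HADDW.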

%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif
    HADDW   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
INIT_XMM sse2, aligned
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
INIT_XMM ssse3
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
INIT_XMM ssse3, aligned
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
INIT_YMM avx2
SAD 16, 16
SAD 16,  8

;=============================================================================
; SAD x3/x4
;=============================================================================
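; The x3/x4 variants score one fenc block against three or four reference
; candidates in a single pass: the fenc rows (fixed FENC_STRIDE, aligned) are
; read with mova, while the candidate rows use unaligned loads.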

%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    paddw   m0, m3
    paddw   m1, m4
    paddw   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDW   m0, m3
    HADDW   m1, m4
    HADDW   m2, m5
%endif
%if UNIX64
    movd    [r5+0], xm0
    movd    [r5+4], xm1
    movd    [r5+8], xm2
%else
    mov     r0, r5mp
    movd    [r0+0], xm0
    movd    [r0+4], xm1
    movd    [r0+8], xm2
%endif
    RET
%endmacro

%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
%endmacro

%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    paddw   m0, m5
    paddw   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    paddw   m2, m5
    paddw   m3, m6
%endif
%endmacro

%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m4
    HADDUW  m1, m5
    HADDUW  m2, m6
    HADDUW  m3, m7
%else
    HADDW   m0, m4
    HADDW   m1, m5
    HADDW   m2, m6
    HADDW   m3, m7
%endif
    mov     r0, r6mp
    movd    [r0+ 0], xm0
    movd    [r0+ 4], xm1
    movd    [r0+ 8], xm2
    movd    [r0+12], xm3
    RET
%endmacro

%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub     r2d, 2
    je .end
.loop:
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m2, m4
    psubw   m3, m5
    psubw   m4, m6
    psubw   m5, m7
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    ABSW    m4, m4, m1
    ABSW    m5, m5, m1
    paddw   m0, m2
    paddw   m0, m3
    paddw   m0, m4
    paddw   m0, m5
    mova    m2, m6
    mova    m3, m7
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd    eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD

INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova    m0, [r0]
    mova    m1, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m0, m1
    pabsw   m0, m0
    sub     r2d, 2
    je .end
.loop:
    mova    m2, [r0]
    mova    m3, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m1, m2
    psubw   m2, m3
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m0, m2
    mova    m1, m3
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1
%else
    HADDUW  m0, m1
%endif
    movd    eax, xm0
    RET

;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
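; SAD_X expands to the x3 or x4 kernels above: STRIDE is r4 for x3 and r5 for
; x4, the first two rows are handled ahead of the loop, and r6 counts the
; remaining two-row groups.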
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_MMX ssse3
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 11
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM xop
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
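; Costs are written to res[] in the order V, H, DC, using the top and left
; neighbours already reconstructed in fdec.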

%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
%if cpuflag(ssse3)
    movddup m0, [r1-1*FDEC_STRIDEB]
%else
    movq    m0, [r1-1*FDEC_STRIDEB]
    punpcklqdq m0, m0
%endif
    movq    m1, [r0+0*FENC_STRIDEB]
    movq    m2, [r0+2*FENC_STRIDEB]
    pshuflw m6, m0, q1032
    paddw   m6, m0
    pshuflw m5, m6, q2301
    paddw   m6, m5
    punpcklqdq m6, m6 ; A+B+C+D 8 times
    movhps  m1, [r0+1*FENC_STRIDEB]
    movhps  m2, [r0+3*FENC_STRIDEB]
    psubw   m3, m1, m0
    psubw   m0, m2
    ABSW2   m3, m0, m3, m0, m4, m5
    paddw   m0, m3
    movd    m3, [r1+0*FDEC_STRIDEB-4]
    movd    m4, [r1+2*FDEC_STRIDEB-4]
    movhps  m3, [r1+1*FDEC_STRIDEB-8]
    movhps  m4, [r1+3*FDEC_STRIDEB-8]
    pshufhw m3, m3, q3333
    pshufhw m4, m4, q3333
    pshuflw m3, m3, q1111 ; FF FF EE EE
    pshuflw m4, m4, q1111 ; HH HH GG GG
    paddw   m5, m3, m4
    paddw   m6, [pw_4]
    paddw   m6, m5
    pshufd  m5, m5, q1032
    paddw   m5, m6
    psrlw   m5, 3
    psubw   m6, m5, m2
    psubw   m5, m1
    psubw   m1, m3
    psubw   m2, m4
    ABSW2   m5, m6, m5, m6, m3, m4
    ABSW2   m1, m2, m1, m2, m3, m4
    paddw   m5, m6
    paddw   m1, m2
%if cpuflag(ssse3)
    phaddw  m0, m1
    movhlps m3, m5
    paddw   m5, m3
    phaddw  m0, m5
    pmaddwd m0, [pw_1]
    mova    [r2], m0
%else
    HADDW   m0, m3
    HADDW   m1, m3
    HADDW   m5, m3
    movd    [r2], m0 ; V prediction cost
    movd    [r2+4], m1 ; H prediction cost
    movd    [r2+8], m5 ; DC prediction cost
%endif
    RET
%endmacro

INIT_XMM sse2
INTRA_SAD_X3_4x4
INIT_XMM ssse3
INTRA_SAD_X3_4x4
INIT_XMM avx
INTRA_SAD_X3_4x4

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------

;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = temp
;m4 = pixel row

%macro INTRA_SAD_HVDC_ITER 2
    mova    m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw   m4, m0
    ABSW    m4, m4, m5
    ACCUM paddw, 1, 4, %1
    mova    m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw   m4, m6
    ABSW    m4, m4, m5
    ACCUM paddw, 2, 4, %1
    pshufd  m5, m7, %2
    psubw   m5, [r0+(%1-4)*FENC_STRIDEB]
    ABSW    m5, m5, m4
    ACCUM paddw, 3, 5, %1
%endmacro

%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
    add     r0, 4*FENC_STRIDEB
    movu    m0, [r1+7*SIZEOF_PIXEL]
    mova    m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    mova    m7, m0
    paddw   m0, m6
    punpckhwd m7, m7
    HADDW   m0, m4
    paddw   m0, [pw_8]
    psrlw   m0, 4
    SPLATW  m0, m0
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    movq    m7, [r1+7*SIZEOF_PIXEL]
    punpcklwd m7, m7
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
%if cpuflag(ssse3)
    phaddw  m2, m3     ; 2 2 2 2 3 3 3 3
    movhlps m3, m1
    paddw   m1, m3     ; 1 1 1 1 _ _ _ _
    phaddw  m2, m1     ; 2 2 3 3 1 1 _ _
    pmaddwd m2, [pw_1] ; 2 3 1 _
    mova    [r2], m2
%else
    HADDW   m2, m4
    HADDW   m3, m4
    HADDW   m1, m4
    movd    [r2+0], m2
    movd    [r2+4], m3
    movd    [r2+8], m1
%endif
    RET
%endmacro

INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8

%macro INTRA_SAD_HVDC_ITER_YMM 2
    mova    xm4, [r0+(%1-4)*FENC_STRIDEB]
    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
    pshufd  m5, m7, %2
    psubw   m5, m4
    pabsw   m5, m5
    ACCUM paddw, 2, 5, %1 ; H
    psubw   m5, m4, m6
    psubw   m4, m0
    pabsw   m5, m5
    pabsw   m4, m4
    ACCUM paddw, 1, 5, %1 ; V
    ACCUM paddw, 3, 4, %1 ; DC
%endmacro

INIT_YMM avx2
cglobal intra_sad_x3_8x8, 3,3,8
    add     r0, 4*FENC_STRIDEB
    movu    xm0, [r1+7*SIZEOF_PIXEL]
    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    vpermq  m7, m0, q0011
    paddw   xm0, xm6
    paddw   xm0, [pw_1] ; equal to +8 after HADDW
    HADDW   xm0, xm4
    psrld   xm0, 4
    vpbroadcastw m0, xm0
    punpcklwd m7, m7
    INTRA_SAD_HVDC_ITER_YMM 0, q3333
    INTRA_SAD_HVDC_ITER_YMM 1, q2222
    INTRA_SAD_HVDC_ITER_YMM 2, q1111
    INTRA_SAD_HVDC_ITER_YMM 3, q0000
    phaddw  m1, m2        ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
    punpckhqdq m2, m3, m3
    paddw   m3, m2        ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
    phaddw  m1, m3        ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
    vextracti128 xm2, m1, 1
    paddw   xm1, xm2      ; 1 1 2 2 3 3 _ _
    pmaddwd xm1, [pw_1]   ; 1 2 3 _
    mova    [r2], xm1
    RET