;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* Parts based on:
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Fiona Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; pshufb/pmulhrsw constants.  pw_m256 is the word pattern 0xff00 (bytes
; 0x00,0xff): used as a pshufb mask it replicates source byte 0 into the low
; byte of every word and zeroes the high byte (pshufb zeroes the lane when the
; index has its MSB set), i.e. it broadcasts pixel 0 zero-extended to words.
; pw_m255 (0xff01) does the same for source byte 1.
pw_m256: times 16 dw -256
pw_m255: times 16 dw -255
pw_4096: times 8 dw 4096

; Byte-shuffle index tables; names encode the byte pattern they produce
; (e.g. pb_4x3_4x2_4x1_4x0 = four 3s, four 2s, four 1s, four 0s).
; A -1 index makes pshufb write a zero byte.
pb_4x3_4x2_4x1_4x0: times 4 db 3
                    times 4 db 2
                    times 4 db 1
                    times 4 db 0
pb_8x1_8x0: times 8 db 1
            times 8 db 0
pb_8x3_8x2: times 8 db 3
            times 8 db 2
pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
             times 8 db -1
pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
             times 9 db 7
pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
              times 10 db 7
pb_2to6_3x7:
pb_2to6_11x7: db 2, 3, 4, 5, 6
              times 11 db 7
pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
pb_13456_3xm1: db 1, 3, 4, 5, 6
               times 3 db -1
pb_6012_4xm1: db 6, 0, 1, 2
              times 4 db -1
pb_6xm1_246_8toE: times 6 db -1
                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
pb_6xm1_BDF_0to6: times 6 db -1
                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15

; AND masks / shuffle helpers used by the non-SSSE3 code paths.
pb_15x0_1xm1: times 15 db 0
              db -1
pb_0to2_5x3: db 0, 1, 2
             times 5 db 3
pb_6xm1_2x0: times 6 db -1
             times 2 db 0
pb_6x0_2xm1: times 6 db 0
             times 2 db -1

cextern pb_1
cextern pb_2
cextern pb_3
cextern pb_15
cextern pw_2
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_255
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_8192

SECTION .text

; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)

; DC prediction for 4x4/8x8: average the N left + N top neighbour pixels
; (rounded) and broadcast that byte over the whole block.
; SSSE3 path: psadbw gives the byte sum in the low word; pmulhrsw by
; 4096 computes (sum*4096 + 0x4000) >> 15 == (sum + 4) >> 3, i.e. the
; rounded average of 8 samples, then pshufb with an all-zero mask (m1/m2
; is still zero from the pxor) broadcasts byte 0.  The pre-SSSE3 path
; does the same with add/shift and pshufw.
%macro DC_4to8_FUNCS 0
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]
    punpckldq               m0, [aq]            ; m0 = l[0..3] | a[0..3]
    pxor                    m1, m1
    psadbw                  m0, m1              ; sum of the 8 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_4096]       ; (sum + 4) >> 3
    pshufb                  m0, m1              ; broadcast byte 0
%else
    paddw                   m0, [pw_4]
    psraw                   m0, 3
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    RET

cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [lq]
    movq                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1              ; sum of the 16 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_2048]       ; (sum + 8) >> 4
    pshufb                  m0, m2
%else
    paddw                   m0, [pw_8]
    psraw                   m0, 4
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET
%endmacro
INIT_MMX mmxext
DC_4to8_FUNCS
INIT_MMX ssse3
DC_4to8_FUNCS

; DC prediction for 16x16/32x32 (SSE2/SSSE3).  Same scheme as the small
; sizes: psadbw per 16-byte lane, fold the two lane sums with movhlps,
; then broadcast the rounded average (pmulhrsw by 2^15/2N == (sum+N)>>log2(2N)
; on SSSE3, add/shift otherwise) and fill the block 4 rows per iteration.
%macro DC_16to32_FUNCS 0
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0
    paddw                   m0, m1              ; sum of 32 neighbours in low word
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_1024]       ; (sum + 16) >> 5
    pshufb                  m0, m2
%else
    paddw                   m0, [pw_16]
    psraw                   m0, 5
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [lq+16]
    mova                    m2, [aq]
    mova                    m3, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m4, m4
    psadbw                  m0, m4
    psadbw                  m1, m4
    psadbw                  m2, m4
    psadbw                  m3, m4
    paddw                   m0, m1
    paddw                   m2, m3
    paddw                   m0, m2
    movhlps                 m1, m0
    paddw                   m0, m1              ; sum of 64 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_512]        ; (sum + 32) >> 6
    pshufb                  m0, m4
%else
    paddw                   m0, [pw_32]
    psraw                   m0, 6
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
    mov                   cntd, 8
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DC_16to32_FUNCS
INIT_XMM ssse3
DC_16to32_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; 32x32 DC with 256-bit registers: one psadbw per side, fold the four
; 64-bit lane sums (vextracti128 + movhlps), round with pmulhrsw and
; broadcast with vpbroadcastb; 8 rows are written per loop iteration.
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    vextracti128           xm1, m0, 1
    paddw                  xm0, xm1
    movhlps                xm1, xm0
    paddw                  xm0, xm1             ; sum of 64 neighbours
    pmulhrsw               xm0, [pw_512]        ; (sum + 32) >> 6
    vpbroadcastb            m0, xm0
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif

; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)

; One-sided DC ("dc_top"/"dc_left"): average only the N pixels of one
; neighbour array (%2 = a or l), hence half the rounding constant /
; one less shift than the two-sided DC above.
%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [%2q]
    pxor                    m1, m1
    psadbw                  m0, m1              ; sum of 4 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_8192]       ; (sum + 2) >> 2
    pshufb                  m0, m1
%else
    paddw                   m0, [pw_2]
    psraw                   m0, 2
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    RET

cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [%2q]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pxor                    m1, m1
    psadbw                  m0, m1              ; sum of 8 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_4096]       ; (sum + 4) >> 3
    pshufb                  m0, m1
%else
    paddw                   m0, [pw_4]
    psraw                   m0, 3
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET
%endmacro

INIT_MMX mmxext
DC_1D_4to8_FUNCS top,  a
DC_1D_4to8_FUNCS left, l
INIT_MMX ssse3
DC_1D_4to8_FUNCS top,  a
DC_1D_4to8_FUNCS left, l

; One-sided DC for 16x16/32x32, same structure as the two-sided versions
; but summing a single neighbour array (%2).
%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    movhlps                 m1, m0
    paddw                   m0, m1              ; sum of 16 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_2048]       ; (sum + 8) >> 4
    pshufb                  m0, m2
%else
    paddw                   m0, [pw_8]
    psraw                   m0, 4
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]
    mova                    m1, [%2q+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0
    paddw                   m0, m1              ; sum of 32 neighbours
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_1024]       ; (sum + 16) >> 5
    pshufb                  m0, m2
%else
    paddw                   m0, [pw_16]
    psraw                   m0, 5
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
    mov                   cntd, 8
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DC_1D_16to32_FUNCS top,  a
DC_1D_16to32_FUNCS left, l
INIT_XMM ssse3
DC_1D_16to32_FUNCS top,  a
DC_1D_16to32_FUNCS left, l

; AVX2 one-sided 32x32 DC: 256-bit load, lane-fold sums, broadcast.
%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
%if HAVE_AVX2_EXTERNAL
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    vextracti128           xm1, m0, 1
    paddw                  xm0, xm1
    movhlps                xm1, xm0
    paddw                  xm0, xm1             ; sum of 32 neighbours
    pmulhrsw               xm0, [pw_1024]       ; (sum + 16) >> 5
    vpbroadcastb            m0, xm0
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
%endmacro

INIT_YMM avx2
DC_1D_AVX2_FUNCS top,  a
DC_1D_AVX2_FUNCS left, l

; v — vertical prediction: every row is a copy of the "above" array.

INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m1, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 8
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_YMM avx
; 32x32 vertical with a single 256-bit register, 8 rows per iteration.
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

; h — horizontal prediction: row y is filled with l[N-1-y] (the left
; neighbours are stored bottom-up in l[], hence the reversed shuffles
; and the downward-counting cnt loops).
; Macro args: %1 = number of xmm regs for 8x8, %2 = for 16x16/32x32.

%macro H_XMM_FUNCS 2
%if notcpuflag(avx)
; 4x4: skipped for AVX since it would be byte-identical to the SSSE3 one.
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
    movd                    m0, [lq]
%if cpuflag(ssse3)
    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]   ; 4x l[3], 4x l[2], ...
%else
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0123
    punpcklwd               m0, m0
%endif
    lea               stride3q, [strideq*3]
    movd      [dstq+strideq*0], m0
    psrldq                  m0, 4
    movd      [dstq+strideq*1], m0
    psrldq                  m0, 4
    movd      [dstq+strideq*2], m0
    psrldq                  m0, 4
    movd      [dstq+stride3q ], m0
    RET
%endif

cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
%if cpuflag(ssse3)
    mova                    m2, [pb_8x1_8x0]
    mova                    m3, [pb_8x3_8x2]
%endif
    lea               stride3q, [strideq*3]
    mov                   cntq, 1
.loop:
    movd                    m0, [lq+cntq*4]     ; 4 left pixels -> 4 rows
%if cpuflag(ssse3)
    pshufb                  m1, m0, m3
    pshufb                  m0, m2
%else
    punpcklbw               m0, m0
    punpcklwd               m0, m0
    pshufd                  m1, m0, q2233
    pshufd                  m0, m0, q0011
%endif
    movq      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    movq      [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET

cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
%if cpuflag(ssse3)
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
%endif
    lea               stride3q, [strideq*3]
    mov                   cntq, 3
.loop:
    movd                    m3, [lq+cntq*4]
%if cpuflag(ssse3)
    pshufb                  m0, m3, m7          ; broadcast byte 3
    pshufb                  m1, m3, m6          ; broadcast byte 2
%else
    punpcklbw               m3, m3
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
%endif
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
%if cpuflag(ssse3)
    pshufb                  m2, m3, m5          ; broadcast byte 1
    pshufb                  m3, m4              ; broadcast byte 0
%else
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
%endif
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET

cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
%if cpuflag(ssse3)
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
%endif
    lea               stride3q, [strideq*3]
    mov                   cntq, 7
.loop:
    movd                    m3, [lq+cntq*4]
%if cpuflag(ssse3)
    pshufb                  m0, m3, m7
    pshufb                  m1, m3, m6
%else
    punpcklbw               m3, m3
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
%endif
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
%if cpuflag(ssse3)
    pshufb                  m2, m3, m5
    pshufb                  m3, m4
%else
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
%endif
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET
%endmacro

INIT_XMM sse2
H_XMM_FUNCS 2, 4
INIT_XMM ssse3
H_XMM_FUNCS 4, 8
INIT_XMM avx
H_XMM_FUNCS 4, 8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; 32x32 horizontal with 256-bit stores; the 4 left pixels per iteration
; are duplicated into both 128-bit lanes before broadcasting.
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
    lea               stride3q, [strideq*3]
    mov                   cntq, 7
.loop:
    movd                   xm3, [lq+cntq*4]
    vinserti128             m3, m3, xm3, 1
    pshufb                  m0, m3, m7
    pshufb                  m1, m3, m6
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufb                  m2, m3, m5
    pshufb                  m3, m4
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET
%endif

; tm — TrueMotion prediction: dst[y][x] = clip(l[y] + a[x] - tl), where
; tl is the top-left neighbour a[-1].  a[x]-tl is precomputed in words,
; then each row adds its (broadcast) left pixel and packuswb clamps to
; 0..255.  pw_m256/pw_m255 pshufb masks broadcast bytes 0/1 of the
; pinsrw'd pair zero-extended to words (two rows per iteration).

%macro TM_MMX_FUNCS 0
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
    pxor                    m1, m1
    movd                    m0, [aq]
    pinsrw                  m2, [aq-1], 0       ; m2 low word = {tl, a[0]}
    punpcklbw               m0, m1              ; a[0..3] as words
    DEFINE_ARGS dst, stride, l, cnt
%if cpuflag(ssse3)
    mova                    m3, [pw_m256]
    mova                    m1, [pw_m255]
    pshufb                  m2, m3              ; broadcast tl to all words
%else
    punpcklbw               m2, m1
    pshufw                  m2, m2, q0000
%endif
    psubw                   m0, m2              ; a[x] - tl
    mov                   cntq, 1
.loop:
    pinsrw                  m2, [lq+cntq*2], 0  ; two left pixels
%if cpuflag(ssse3)
    pshufb                  m4, m2, m1          ; broadcast l[2*cnt+1]
    pshufb                  m2, m3              ; broadcast l[2*cnt]
%else
    punpcklbw               m2, m1
    pshufw                  m4, m2, q1111
    pshufw                  m2, m2, q0000
%endif
    paddw                   m4, m0
    paddw                   m2, m0
    packuswb                m4, m4              ; clamp to 0..255
    packuswb                m2, m2
    movd      [dstq+strideq*0], m4
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endmacro

INIT_MMX mmxext
TM_MMX_FUNCS
INIT_MMX ssse3
TM_MMX_FUNCS

%macro TM_XMM_FUNCS 0
cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
    pxor                    m1, m1
    movh                    m0, [aq]
    pinsrw                  m2, [aq-1], 0
    punpcklbw               m0, m1
    DEFINE_ARGS dst, stride, l, cnt
%if cpuflag(ssse3)
    mova                    m3, [pw_m256]
    mova                    m1, [pw_m255]
    pshufb                  m2, m3
%else
    punpcklbw               m2, m1
    punpcklwd               m2, m2
    pshufd                  m2, m2, q0000
%endif
    psubw                   m0, m2              ; a[x] - tl
    mov                   cntq, 3
.loop:
    pinsrw                  m2, [lq+cntq*2], 0
%if cpuflag(ssse3)
    pshufb                  m4, m2, m1
    pshufb                  m2, m3
%else
    punpcklbw               m2, m1
    punpcklwd               m2, m2
    pshufd                  m4, m2, q1111
    pshufd                  m2, m2, q0000
%endif
    paddw                   m4, m0
    paddw                   m2, m0
    packuswb                m4, m2              ; two rows packed together
    movh      [dstq+strideq*0], m4
    movhps    [dstq+strideq*1], m4
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    mova                    m0, [aq]
    pinsrw                  m2, [aq-1], 0
    punpckhbw               m1, m0, m3          ; a[8..15] as words
    punpcklbw               m0, m3              ; a[0..7] as words
    DEFINE_ARGS dst, stride, l, cnt
%if cpuflag(ssse3)
    mova                    m4, [pw_m256]
    mova                    m3, [pw_m255]
    pshufb                  m2, m4
%else
    punpcklbw               m2, m3
    punpcklwd               m2, m2
    pshufd                  m2, m2, q0000
%endif
    psubw                   m1, m2
    psubw                   m0, m2              ; a[x] - tl, both halves
    mov                   cntq, 7
.loop:
    pinsrw                  m7, [lq+cntq*2], 0
%if cpuflag(ssse3)
    pshufb                  m5, m7, m3
    pshufb                  m7, m4
%else
    punpcklbw               m7, m3
    punpcklwd               m7, m7
    pshufd                  m5, m7, q1111
    pshufd                  m7, m7, q0000
%endif
    paddw                   m2, m5, m0
    paddw                   m5, m1
    paddw                   m6, m7, m0
    paddw                   m7, m1
    packuswb                m2, m5
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

; 32x32: keeps the 4 (a[x]-tl) word vectors in m8-m11 on x86-64; on
; x86-32 there are not enough xmm regs, so they are spilled to 64 bytes
; of stack instead.
%if ARCH_X86_64
%define mem 0
%else
%define mem 64
%endif
cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
    pxor                    m5, m5
    pinsrw                  m4, [aq-1], 0
    mova                    m0, [aq]
    mova                    m2, [aq+16]
    DEFINE_ARGS dst, stride, l, cnt
%if cpuflag(ssse3)
%if ARCH_X86_64
    mova                   m12, [pw_m256]
    mova                   m13, [pw_m255]
%define pw_m256_reg m12
%define pw_m255_reg m13
%else
%define pw_m256_reg [pw_m256]
%define pw_m255_reg [pw_m255]
%endif
    pshufb                  m4, pw_m256_reg     ; broadcast tl
%else
    punpcklbw               m4, m5
    punpcklwd               m4, m4
    pshufd                  m4, m4, q0000
%endif
    punpckhbw               m1, m0, m5
    punpckhbw               m3, m2, m5
    punpcklbw               m0, m5
    punpcklbw               m2, m5
    psubw                   m1, m4
    psubw                   m0, m4
    psubw                   m3, m4
    psubw                   m2, m4              ; a[x] - tl (4 x 8 words)
%if ARCH_X86_64
    SWAP                     0, 8
    SWAP                     1, 9
    SWAP                     2, 10
    SWAP                     3, 11
%else
    mova            [rsp+0*16], m0
    mova            [rsp+1*16], m1
    mova            [rsp+2*16], m2
    mova            [rsp+3*16], m3
%endif
    mov                   cntq, 15
.loop:
    pinsrw                  m3, [lq+cntq*2], 0
%if cpuflag(ssse3)
    pshufb                  m7, m3, pw_m255_reg
    pshufb                  m3, pw_m256_reg
%else
    pxor                    m7, m7
    punpcklbw               m3, m7
    punpcklwd               m3, m3
    pshufd                  m7, m3, q1111
    pshufd                  m3, m3, q0000
%endif
%if ARCH_X86_64
    paddw                   m4, m7, m8
    paddw                   m5, m7, m9
    paddw                   m6, m7, m10
    paddw                   m7, m11
    paddw                   m0, m3, m8
    paddw                   m1, m3, m9
    paddw                   m2, m3, m10
    paddw                   m3, m11
%else
    paddw                   m4, m7, [rsp+0*16]
    paddw                   m5, m7, [rsp+1*16]
    paddw                   m6, m7, [rsp+2*16]
    paddw                   m7, [rsp+3*16]
    paddw                   m0, m3, [rsp+0*16]
    paddw                   m1, m3, [rsp+1*16]
    paddw                   m2, m3, [rsp+2*16]
    paddw                   m3, [rsp+3*16]
%endif
    packuswb                m4, m5
    packuswb                m6, m7
    packuswb                m0, m1
    packuswb                m2, m3
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m6
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%undef pw_m256_reg
%undef pw_m255_reg
%undef mem
%endmacro

INIT_XMM sse2
TM_XMM_FUNCS
INIT_XMM ssse3
TM_XMM_FUNCS
INIT_XMM avx
TM_XMM_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; 32x32 TrueMotion with 256-bit regs: one row per store, two rows per
; iteration; tl and the per-row left pixel are duplicated into both lanes.
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    pinsrw                 xm2, [aq-1], 0
    vinserti128             m2, m2, xm2, 1
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m4, [pw_m256]
    mova                    m5, [pw_m255]
    pshufb                  m2, m4              ; broadcast tl
    punpckhbw               m1, m0, m3
    punpcklbw               m0, m3
    psubw                   m1, m2
    psubw                   m0, m2              ; a[x] - tl
    mov                   cntq, 15
.loop:
    pinsrw                 xm7, [lq+cntq*2], 0
    vinserti128             m7, m7, xm7, 1
    pshufb                  m3, m7, m5
    pshufb                  m7, m4
    paddw                   m2, m3, m0
    paddw                   m3, m1
    paddw                   m6, m7, m0
    paddw                   m7, m1
    packuswb                m2, m3
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endif

; dl — diagonal down-left prediction.

; LOWPASS: bytewise 3-tap smoothing filter used by all diagonal modes:
;   %1 = (%1 + 2*%2 + %3 + 2) >> 2
; pavgb(%1,%3) rounds up, so the ((%1^%3)&1) bit is subtracted first to
; get floor((%1+%3)/2); the final pavgb with %2 then matches the exact
; rounded 3-tap result.  %4 is a scratch register index.
%macro LOWPASS 4 ; left [dst], center, right, tmp
    pxor                   m%4, m%1, m%3
    pand                   m%4, [pb_1]
    pavgb                  m%1, m%3
    psubusb                m%1, m%4
    pavgb                  m%1, m%2
%endmacro

%macro DL_MMX_FUNCS 0
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq]
%if cpuflag(ssse3)
    pshufb                  m0, m1, [pb_0to5_2x7]
    pshufb                  m2, m1, [pb_2to6_3x7]
%else
    punpckhbw               m3, m1, m1          ; 44556677
    pand                    m0, m1, [pb_6xm1_2x0] ; 012345__
    pand                    m3, [pb_6x0_2xm1]   ; ______77
    psrlq                   m2, m1, 16          ; 234567__
    por                     m0, m3              ; 01234577
    por                     m2, m3              ; 23456777
%endif
    psrlq                   m1, 8
    LOWPASS                  0, 1, 2, 3         ; filtered above row

    pshufw                  m1, m0, q3321
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    psrlq                   m0, 8               ; shift one sample per row
    psrlq                   m1, 8
    add                   dstq, strideq
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    RET
%endmacro

INIT_MMX mmxext
DL_MMX_FUNCS
INIT_MMX ssse3
DL_MMX_FUNCS

%macro DL_XMM_FUNCS 0
cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
    movq                    m0, [aq]
    lea               stride5q, [strideq*5]
%if cpuflag(ssse3)
    pshufb                  m1, m0, [pb_1to6_10x7]
%else
    punpcklbw               m1, m0, m0          ; 0011223344556677
    punpckhwd               m1, m1              ; 4x4,4x5,4x6,4x7
%endif
    shufps                  m0, m1, q3310
%if notcpuflag(ssse3)
    psrldq                  m1, m0, 1
    shufps                  m1, m0, q3210
%endif
    psrldq                  m2, m1, 1
    LOWPASS                  0, 1, 2, 3

    pshufd                  m1, m0, q3321
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    RET

cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m0, [aq]
%if cpuflag(ssse3)
    mova                    m5, [pb_1toE_2xF]   ; shift-left-by-1 w/ last-pixel clamp
    pshufb                  m1, m0, m5
    pshufb                  m2, m1, m5
    pshufb                  m4, m0, [pb_15]
%else
    pand                    m5, m0, [pb_15x0_1xm1] ; _______________F
    psrldq                  m1, m0, 1           ; 123456789ABCDEF_
    por                     m1, m5              ; 123456789ABCDEFF
    psrldq                  m2, m1, 1           ; 23456789ABCDEFF_
    por                     m2, m5              ; 23456789ABCDEFFF
    pshufhw                 m4, m1, q3333       ; xxxxxxxxFFFFFFFF
%endif
    LOWPASS                  0, 1, 2, 3
    DEFINE_ARGS dst, stride, cnt, stride9
    lea               stride9q, [strideq+strideq*8]
    mov                   cntd, 4

.loop:
    movhlps                 m4, m0
    mova      [dstq+strideq*0], m0
%if cpuflag(ssse3)
    pshufb                  m0, m5
%else
    psrldq                  m0, 1
    por                     m0, m5
%endif
    mova      [dstq+strideq*8], m4
    movhlps                 m4, m0
    mova      [dstq+strideq*1], m0
%if cpuflag(ssse3)
    pshufb                  m0, m5
%else
    psrldq                  m0, 1
    por                     m0, m5
%endif
    mova      [dstq+stride9q ], m4
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
    mova                    m0, [aq]
    mova                    m1, [aq+16]
    PALIGNR                 m2, m1, m0, 1, m4
    PALIGNR                 m3, m1, m0, 2, m4
    LOWPASS                  0, 2, 3, 4         ; filtered a[0..15]
%if cpuflag(ssse3)
    mova                    m5, [pb_1toE_2xF]
    pshufb                  m2, m1, m5
    pshufb                  m3, m2, m5
    pshufb                  m6, m1, [pb_15]
    mova                    m7, m6
%else
    pand                    m5, m1, [pb_15x0_1xm1] ; _______________F
    psrldq                  m2, m1, 1           ; 123456789ABCDEF_
    por                     m2, m5              ; 123456789ABCDEFF
    psrldq                  m3, m2, 1           ; 23456789ABCDEFF_
    por                     m3, m5              ; 23456789ABCDEFFF
    pshufhw                 m7, m2, q3333       ; xxxxxxxxFFFFFFFF
    pshufd                  m6, m7, q3333
%endif
    LOWPASS                  1, 2, 3, 4         ; filtered a[16..31]
    lea                 dst16q, [dstq  +strideq*8]
    mov                   cntd, 8
    lea                 dst16q, [dst16q+strideq*8]
.loop:
    movhlps                 m7, m1
    mova [dstq  +strideq*0+ 0], m0
    mova [dstq  +strideq*0+16], m1
    movhps [dstq+strideq*8+ 0], m0
    movq [dstq  +strideq*8+ 8], m1
    mova [dstq  +strideq*8+16], m7
    mova [dst16q+strideq*0+ 0], m1
    mova [dst16q+strideq*0+16], m6
    mova [dst16q+strideq*8+ 0], m7
    mova [dst16q+strideq*8+16], m6
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 1       ; shift the 32-byte row left by 1
    pshufb                  m1, m5
%elif cpuflag(ssse3)
    palignr                 m2, m1, m0, 1
    pshufb                  m1, m5
    mova                    m0, m2
%else
    mova                    m4, m1
    psrldq                  m0, 1
    pslldq                  m4, 15
    psrldq                  m1, 1
    por                     m0, m4
    por                     m1, m5
%endif
    add                   dstq, strideq
    add                 dst16q, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DL_XMM_FUNCS
INIT_XMM ssse3
DL_XMM_FUNCS
INIT_XMM avx
DL_XMM_FUNCS

; dr — diagonal down-right prediction.

%macro DR_MMX_FUNCS 0
cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]
    punpckldq               m0, [aq-1]          ; l[0..3] | tl,a[0..2]
    movd                    m1, [aq+3]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    PALIGNR                 m1, m0, 1, m3
    psrlq                   m2, m1, 8
    LOWPASS                  0, 1, 2, 3

    ; each row is the filtered diagonal shifted right by one byte
    movd      [dstq+stride3q ], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*2], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*1], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*0], m0
    RET
%endmacro

INIT_MMX mmxext
DR_MMX_FUNCS
INIT_MMX ssse3
DR_MMX_FUNCS

%macro DR_XMM_FUNCS 0
cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m1, [lq]
    movhps                  m1, [aq-1]
    movd                    m2, [aq+7]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pslldq                  m0, m1, 1
    PALIGNR                 m2, m1, 1, m3
    LOWPASS                  0, 1, 2, 3

    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*4]
    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    RET

cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m1, [lq]
    movu                    m2, [aq-1]
    movd                    m4, [aq+15]
    DEFINE_ARGS dst, stride, stride9, cnt
    lea               stride9q, [strideq *3]
    mov                   cntd, 4
    lea               stride9q, [stride9q*3]
    PALIGNR                 m4, m2, 1, m5
    PALIGNR                 m3, m2, m1, 15, m5
    LOWPASS                  3, 2, 4, 5         ; top half of the diagonal
    pslldq                  m0, m1, 1
    PALIGNR                 m2, m1, 1, m4
    LOWPASS                  0, 1, 2, 4         ; left half of the diagonal

.loop:
    mova    [dstq+strideq*0  ], m3
    movhps  [dstq+strideq*8+0], m0
    movq    [dstq+strideq*8+8], m3
    PALIGNR                 m3, m0, 15, m1      ; rotate one pixel per row
    pslldq                  m0, 1
    mova    [dstq+strideq*1  ], m3
    movhps  [dstq+stride9q +0], m0
    movq    [dstq+stride9q +8], m3
    PALIGNR                 m3, m0, 15, m1
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
    mova                    m1, [lq]
    mova                    m2, [lq+16]
    movu                    m3, [aq-1]
    movu                    m4, [aq+15]
    movd                    m5, [aq+31]
    DEFINE_ARGS dst, stride, stride8, cnt
    lea               stride8q, [strideq*8]
    PALIGNR                 m5, m4, 1, m7
    PALIGNR                 m6, m4, m3, 15, m7
    LOWPASS                  5, 4, 6, 7         ; filtered a[16..31]
    PALIGNR                 m4, m3, 1, m7
    PALIGNR                 m6, m3, m2, 15, m7
    LOWPASS                  4, 3, 6, 7         ; filtered tl/a[0..15]
    PALIGNR                 m3, m2, 1, m7
    PALIGNR                 m6, m2, m1, 15, m7
    LOWPASS                  3, 2, 6, 7         ; filtered l[16..31]
    PALIGNR                 m2, m1, 1, m6
    pslldq                  m0, m1, 1
    LOWPASS                  2, 1, 0, 6         ; filtered l[0..15]
    mov                   cntd, 16

    ; out=m2/m3/m4/m5
.loop:
    mova  [dstq+stride8q*0+ 0], m4
    mova  [dstq+stride8q*0+16], m5
    mova  [dstq+stride8q*2+ 0], m3
    mova  [dstq+stride8q*2+16], m4
    PALIGNR                 m5, m4, 15, m6      ; slide the 64-byte diagonal
    PALIGNR                 m4, m3, 15, m6
    PALIGNR                 m3, m2, 15, m6
    pslldq                  m2, 1
    add                   dstq, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DR_XMM_FUNCS
INIT_XMM ssse3
DR_XMM_FUNCS
INIT_XMM avx
DR_XMM_FUNCS

; vl — vertical-left prediction: even rows use avg(a[x],a[x+1]), odd
; rows the LOWPASS-filtered values, both sliding left by one per pair.

INIT_MMX mmxext
cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]
    psrlq                   m1, m0, 8
    psrlq                   m2, m1, 8
    LOWPASS                  2, 1, 0, 3
    pavgb                   m1, m0
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    psrlq                   m1, 8
    psrlq                   m2, 8
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    RET

%macro VL_XMM_FUNCS 0
cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m0, [aq]
%if cpuflag(ssse3)
    pshufb                  m0, [pb_0to6_9x7]   ; clamp with repeated a[7]
%else
    punpcklbw               m1, m0, m0
    punpckhwd               m1, m1
    shufps                  m0, m1, q3310
%endif
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    psrldq                  m1, m0, 1
    psrldq                  m2, m0, 2
    LOWPASS                  2, 1, 0, 3
    pavgb                   m1, m0

    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    lea                   dstq, [dstq+strideq*4]
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
%if cpuflag(ssse3)
    mova                    m4, [pb_1toE_2xF]
    pshufb                  m1, m0, m4
    pshufb                  m2, m1, m4
%else
    pand                    m4, m0, [pb_15x0_1xm1] ; _______________F
    psrldq                  m1, m0, 1           ; 123456789ABCDEF_
    por                     m1, m4              ; 123456789ABCDEFF
    psrldq                  m2, m1, 1           ; 23456789ABCDEFF_
    por                     m2, m4              ; 23456789ABCDEFFF
%endif
    LOWPASS                  2, 1, 0, 3
    pavgb                   m1, m0
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
%if cpuflag(ssse3)
    pshufb                  m1, m4
    pshufb                  m2, m4
%else
    psrldq                  m1, 1
    psrldq                  m2, 1
    por                     m1, m4
    por                     m2, m4
%endif
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
%if cpuflag(ssse3)
    pshufb                  m1, m4
    pshufb                  m2, m4
%else
    psrldq                  m1, 1
    psrldq                  m2, 1
    por                     m1, m4
    por                     m2, m4
%endif
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m5, [aq+16]
    DEFINE_ARGS dst, stride, dst16, cnt
    PALIGNR                 m2, m5, m0, 1, m4
    PALIGNR                 m3, m5, m0, 2, m4
    lea                 dst16q, [dstq  +strideq*8]
    LOWPASS                  3, 2, 0, 6
    pavgb                   m2, m0              ; low half: avg / filtered
%if cpuflag(ssse3)
    mova                    m4, [pb_1toE_2xF]
    pshufb                  m0, m5, m4
    pshufb                  m1, m0, m4
%else
    pand                    m4, m5, [pb_15x0_1xm1] ; _______________F
    psrldq                  m0, m5, 1           ; 123456789ABCDEF_
    por                     m0, m4              ; 123456789ABCDEFF
    psrldq                  m1, m0, 1           ; 23456789ABCDEFF_
    por                     m1, m4              ; 23456789ABCDEFFF
%endif
    lea                 dst16q, [dst16q+strideq*8]
    LOWPASS                  1, 0, 5, 6
    pavgb                   m0, m5              ; high half: avg / filtered
%if cpuflag(ssse3)
    pshufb                  m5, [pb_15]         ; broadcast last above pixel
%else
    punpckhbw               m5, m4, m4
    pshufhw                 m5, m5, q3333
    punpckhqdq              m5, m5
%endif
    mov                   cntd, 8

.loop:
; emit one avg row + one filtered row pair, then slide both 32-byte
; rows left by one pixel (clamped with m4/m5).
%macro %%write 3
    mova   [dstq+stride%1+ 0], %2
    mova   [dstq+stride%1+16], %3
    movhps [dst16q+stride%1 ], %2
    movu   [dst16q+stride%1+ 8], %3
    movq   [dst16q+stride%1+24], m5
%if cpuflag(avx)
    palignr                 %2, %3, %2, 1
    pshufb                  %3, m4
%elif cpuflag(ssse3)
    palignr                 m6, %3, %2, 1
    pshufb                  %3, m4
    mova                    %2, m6
%else
    pslldq                  m6, %3, 15
    psrldq                  %3, 1
    psrldq                  %2, 1
    por                     %3, m4
    por                     %2, m6
%endif
%endmacro

    %%write                q*0, m2, m0
    %%write                q*1, m3, m1
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
VL_XMM_FUNCS
INIT_XMM ssse3
VL_XMM_FUNCS
INIT_XMM avx
VL_XMM_FUNCS

; vr — vertical-right prediction.

%macro VR_MMX_FUNCS 0
cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq-1]
    punpckldq               m2, [lq]
    movd                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1              ; avg(a[x], a[x-1]) row
    PALIGNR                 m1, m2, 5, m3
    psrlq                   m2, m1, 8
    psllq                   m3, m1, 8
    LOWPASS                  2, 1, 3, 4

    ; ABCD <- for the following predictor:
    ; EFGH
    ; IABC  | m0 contains ABCDxxxx
    ; JEFG  | m2 contains xJIEFGHx

%if cpuflag(ssse3)
    punpckldq               m0, m2
    pshufb                  m2, [pb_13456_3xm1]
    movd      [dstq+strideq*0], m0
    pshufb                  m0, [pb_6012_4xm1]
    movd      [dstq+stride3q ], m2
    psrlq                   m2, 8
    movd      [dstq+strideq*2], m0
    movd      [dstq+strideq*1], m2
%else
    psllq                   m1, m2, 40
    psrlq                   m2, 24
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m2
    PALIGNR                 m0, m1, 7, m3
    psllq                   m1, 8
    PALIGNR                 m2, m1, 7, m3
    movd      [dstq+strideq*2], m0
    movd      [dstq+stride3q ], m2
%endif
    RET
%endmacro

INIT_MMX mmxext
VR_MMX_FUNCS
INIT_MMX ssse3
VR_MMX_FUNCS

%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
    movu                    m1, [aq-1]
    movhps                  m2, [lq]
    movq                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1              ; avg(a[x], a[x-1]) row
    PALIGNR                 m1, m2, 9, m3
    pslldq                  m2, m1, 1
    pslldq                  m3, m1, 2
    LOWPASS                  1, 2, 3, 4

    ; ABCDEFGH <- for the following predictor:
    ; IJKLMNOP
    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
    ; SQABCDEF
    ; TRIJKLMN
    ; USQABCDE
    ; VTRIJKLM

%if cpuflag(ssse3)
    punpcklqdq              m0, m1              ; ABCDEFGHxxVUTSRQ
%endif
    movq      [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m1
%if cpuflag(ssse3)
    pshufb                  m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
    pshufb                  m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
%else
    psrlw                   m2, m1, 8           ; x_U_S_Q_xxxxxxxx
    pand                    m3, m1, [pw_255]    ; x_V_T_R_xxxxxxxx
    packuswb                m3, m2              ; xVTRxxxxxUSQxxxx
    pslldq                  m3, 4               ; xxxxxVTRxxxxxUSQ
    PALIGNR                 m0, m3, 7, m4       ; xxxxxxUSQABCDEFG
    psrldq                  m1, 8
    pslldq                  m3, 8
    PALIGNR                 m1, m3, 7, m4       ; xxxxxxVTRIJKLMNO
%endif
    movhps    [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m1
    lea                   dstq, [dstq+strideq*4]
    pslldq                  m0, 1
    pslldq                  m1, 1
    movhps    [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m1
    pslldq                  m0, 1
    pslldq                  m1, 1
    movhps    [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m1
    RET

cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
    mova                    m0, [aq]
    movu                    m1, [aq-1]
    mova                    m2, [lq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    PALIGNR                 m3, m1, m2, 15, m6
    LOWPASS                  3, 1, 0, 4         ; filtered row (odd rows)
    pavgb                   m0, m1              ; avg row (even rows)
    PALIGNR                 m1, m2, 1, m6
    pslldq                  m4, m2, 1
    LOWPASS                  1, 2, 4, 5         ; filtered left edge
%if cpuflag(ssse3)
    pshufb                  m1, [pb_02468ACE_13579BDF] ; de-interleave even/odd
%else
    psrlw                   m5, m1, 8
    pand                    m1, [pw_255]
    packuswb                m1, m5
%endif
    mov                   cntd, 4

.loop:
    movlhps                 m2, m1
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR                 m4, m0, m1, 15, m6
    PALIGNR                 m5, m3, m2, 15, m6
    mova      [dstq+strideq*2], m4
    mova      [dstq+stride3q ], m5
    lea                   dstq, [dstq+strideq*4]
    PALIGNR                 m0, m1, 14, m6      ; shift in two left pixels / 4 rows
    PALIGNR                 m3, m2, 14, m6
    pslldq                  m1, 2
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m2, [aq+16]
    movu                    m1, [aq-1]
    PALIGNR                 m3, m2, m0, 15, m6
    PALIGNR                 m4, m2, m0, 14, m6
    LOWPASS                  4, 3, 2, 5         ; filtered a[16..31]
    pavgb                   m3, m2              ; avg a[16..31]
    mova                    m2, [lq+16]
    PALIGNR                 m5, m1, m2, 15, m6
    LOWPASS                  5, 1, 0, 6         ; filtered a[0..15]
    pavgb                   m0, m1              ; avg a[0..15]
    mova                    m6, [lq]
%if ARCH_X86_64
    SWAP                     0, 8
%else
    ; x86-32: not enough xmm regs, park m0 in the (not yet written)
    ; destination block and reload it after the left edge is computed.
    mova                [dstq], m0
%endif
    PALIGNR                 m1, m2, 1, m0
    PALIGNR                 m7, m2, m6, 15, m0
    LOWPASS                  1, 2, 7, 0         ; filtered l[16..31]
    PALIGNR                 m2, m6, 1, m0
    pslldq                  m7, m6, 1
    LOWPASS                  2, 6, 7, 0         ; filtered l[0..15]
%if cpuflag(ssse3)
    pshufb                  m1, [pb_02468ACE_13579BDF]
    pshufb                  m2, [pb_02468ACE_13579BDF]
%else
    psrlw                   m0, m1, 8
    psrlw                   m6, m2, 8
    pand                    m1, [pw_255]
    pand                    m2, [pw_255]
    packuswb                m1, m0
    packuswb                m2, m6
%endif
    DEFINE_ARGS dst, stride, dst16, cnt
    lea                 dst16q, [dstq  +strideq*8]
    lea                 dst16q, [dst16q+strideq*8]
    SBUTTERFLY             qdq, 2, 1, 6
%if ARCH_X86_64
    SWAP                     0, 8
%else
    mova                    m0, [dstq]
%endif
    mov                   cntd, 8

.loop:
    ; even lines (0, 2, 4, ...): m1 | m0, m3
    ; odd  lines (1, 3, 5, ...): m2 | m5, m4
%macro %%write 4
    mova   [dstq+stride%1+ 0], %3
    mova   [dstq+stride%1+16], %4
    movhps [dst16q+stride%1 ], %2
    movu   [dst16q+stride%1+ 8], %3
    movq   [dst16q+stride%1+24], %4
    PALIGNR                 %4, %3, 15, m6
    PALIGNR                 %3, %2, 15, m6
    pslldq                  %2, 1
%endmacro

    %%write                q*0, m1, m0, m3
    %%write                q*1, m2, m5, m4
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
VR_XMM_FUNCS 7
INIT_XMM ssse3
1636VR_XMM_FUNCS 6 1637INIT_XMM avx 1638VR_XMM_FUNCS 6 1639 1640; hd 1641 1642INIT_MMX mmxext 1643cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a 1644 movd m0, [lq] 1645 punpckldq m0, [aq-1] 1646 DEFINE_ARGS dst, stride, stride3 1647 lea stride3q, [strideq*3] 1648 psrlq m1, m0, 8 1649 psrlq m2, m1, 8 1650 LOWPASS 2, 1, 0, 3 1651 pavgb m1, m0 1652 1653 ; DHIJ <- for the following predictor: 1654 ; CGDH 1655 ; BFCG | m1 contains ABCDxxxx 1656 ; AEBF | m2 contains EFGHIJxx 1657 1658 punpcklbw m1, m2 1659 punpckhdq m0, m1, m2 1660 1661 ; m1 contains AEBFCGDH 1662 ; m0 contains CGDHIJxx 1663 1664 movd [dstq+stride3q ], m1 1665 movd [dstq+strideq*1], m0 1666 psrlq m1, 16 1667 psrlq m0, 16 1668 movd [dstq+strideq*2], m1 1669 movd [dstq+strideq*0], m0 1670 RET 1671 1672%macro HD_XMM_FUNCS 0 1673cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a 1674 movq m0, [lq] 1675 movhps m0, [aq-1] 1676 DEFINE_ARGS dst, stride, stride3, dst4 1677 lea stride3q, [strideq*3] 1678 lea dst4q, [dstq+strideq*4] 1679 psrldq m1, m0, 1 1680 psrldq m2, m1, 1 1681 LOWPASS 2, 1, 0, 3 1682 pavgb m1, m0 1683 1684 ; HPQRSTUV <- for the following predictor 1685 ; GOHPQRST 1686 ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx 1687 ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx 1688 ; DLEMFNGO 1689 ; CKDLEMFN 1690 ; BJCKDLEM 1691 ; AIBJCKDL 1692 1693 punpcklbw m1, m2 1694 movhlps m2, m2 1695 1696 ; m1 contains AIBJCKDLEMFNGOHP 1697 ; m2 contains QRSTUVxxxxxxxxxx 1698 1699 movhps [dstq +stride3q ], m1 1700 movq [dst4q+stride3q ], m1 1701 PALIGNR m3, m2, m1, 2, m4 1702 movhps [dstq +strideq*2], m3 1703 movq [dst4q+strideq*2], m3 1704 PALIGNR m3, m2, m1, 4, m4 1705 movhps [dstq +strideq*1], m3 1706 movq [dst4q+strideq*1], m3 1707 PALIGNR m2, m1, 6, m4 1708 movhps [dstq +strideq*0], m2 1709 movq [dst4q+strideq*0], m2 1710 RET 1711 1712cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a 1713 mova m0, [lq] 1714 movu m3, [aq-1] 1715 DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12 1716 lea stride4q, 
[strideq*4] 1717 lea dst4q, [dstq +stride4q] 1718 lea dst8q, [dst4q+stride4q] 1719 lea dst12q, [dst8q+stride4q] 1720 psrldq m4, m3, 1 1721 psrldq m5, m3, 2 1722 LOWPASS 5, 4, 3, 6 1723 PALIGNR m1, m3, m0, 1, m6 1724 PALIGNR m2, m3, m0, 2, m6 1725 LOWPASS 2, 1, 0, 6 1726 pavgb m1, m0 1727 SBUTTERFLY bw, 1, 2, 6 1728 1729 ; I PROBABLY INVERTED L0 ad L16 here 1730 ; m1, m2, m5 1731.loop: 1732 sub stride4q, strideq 1733 movhps [dstq +stride4q +0], m2 1734 movq [dstq +stride4q +8], m5 1735 mova [dst4q+stride4q ], m2 1736 movhps [dst8q+stride4q +0], m1 1737 movq [dst8q+stride4q +8], m2 1738 mova [dst12q+stride4q ], m1 1739%if cpuflag(avx) 1740 palignr m1, m2, m1, 2 1741 palignr m2, m5, m2, 2 1742%elif cpuflag(ssse3) 1743 palignr m3, m2, m1, 2 1744 palignr m0, m5, m2, 2 1745 mova m1, m3 1746 mova m2, m0 1747%else 1748 ; slightly modified version of PALIGNR 1749 mova m6, m2 1750 mova m4, m5 1751 pslldq m6, 14 1752 pslldq m4, 14 1753 psrldq m1, 2 1754 psrldq m2, 2 1755 por m1, m6 1756 por m2, m4 1757%endif 1758 psrldq m5, 2 1759 jg .loop 1760 RET 1761 1762cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a 1763 mova m0, [lq] 1764 mova m1, [lq+16] 1765 movu m2, [aq-1] 1766 movu m3, [aq+15] 1767 DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24 1768 lea stride8q, [strideq*8] 1769 lea dst8q, [dstq +stride8q] 1770 lea dst16q, [dst8q +stride8q] 1771 lea dst24q, [dst16q+stride8q] 1772 psrldq m4, m3, 1 1773 psrldq m5, m3, 2 1774 LOWPASS 5, 4, 3, 6 1775 PALIGNR m4, m3, m2, 2, m6 1776 PALIGNR m3, m2, 1, m6 1777 LOWPASS 4, 3, 2, 6 1778 PALIGNR m3, m2, m1, 2, m6 1779 PALIGNR m2, m1, 1, m6 1780 LOWPASS 3, 2, 1, 6 1781 pavgb m2, m1 1782 PALIGNR m6, m1, m0, 1, m7 1783 PALIGNR m1, m0, 2, m7 1784 LOWPASS 1, 6, 0, 7 1785 pavgb m0, m6 1786 SBUTTERFLY bw, 2, 3, 6 1787 SBUTTERFLY bw, 0, 1, 6 1788 1789 ; m0, m1, m2, m3, m4, m5 1790.loop: 1791 sub stride8q, strideq 1792 mova [dstq +stride8q+ 0], m3 1793 mova [dstq +stride8q+16], m4 1794 mova [dst8q +stride8q+ 0], m2 1795 mova [dst8q 
+stride8q+16], m3 1796 mova [dst16q+stride8q+ 0], m1 1797 mova [dst16q+stride8q+16], m2 1798 mova [dst24q+stride8q+ 0], m0 1799 mova [dst24q+stride8q+16], m1 1800%if cpuflag(avx) 1801 palignr m0, m1, m0, 2 1802 palignr m1, m2, m1, 2 1803 palignr m2, m3, m2, 2 1804 palignr m3, m4, m3, 2 1805 palignr m4, m5, m4, 2 1806 psrldq m5, 2 1807%elif cpuflag(ssse3) 1808 psrldq m6, m5, 2 1809 palignr m5, m4, 2 1810 palignr m4, m3, 2 1811 palignr m3, m2, 2 1812 palignr m2, m1, 2 1813 palignr m1, m0, 2 1814 mova m0, m1 1815 mova m1, m2 1816 mova m2, m3 1817 mova m3, m4 1818 mova m4, m5 1819 mova m5, m6 1820%else 1821 ; sort of a half-integrated version of PALIGNR 1822 pslldq m7, m4, 14 1823 pslldq m6, m5, 14 1824 psrldq m4, 2 1825 psrldq m5, 2 1826 por m4, m6 1827 pslldq m6, m3, 14 1828 psrldq m3, 2 1829 por m3, m7 1830 pslldq m7, m2, 14 1831 psrldq m2, 2 1832 por m2, m6 1833 pslldq m6, m1, 14 1834 psrldq m1, 2 1835 por m1, m7 1836 psrldq m0, 2 1837 por m0, m6 1838%endif 1839 jg .loop 1840 RET 1841%endmacro 1842 1843INIT_XMM sse2 1844HD_XMM_FUNCS 1845INIT_XMM ssse3 1846HD_XMM_FUNCS 1847INIT_XMM avx 1848HD_XMM_FUNCS 1849 1850%macro HU_MMX_FUNCS 0 1851cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l 1852 movd m0, [lq] 1853%if cpuflag(ssse3) 1854 pshufb m0, [pb_0to2_5x3] 1855%else 1856 punpcklbw m1, m0, m0 ; 00112233 1857 pshufw m1, m1, q3333 ; 33333333 1858 punpckldq m0, m1 ; 01233333 1859%endif 1860 psrlq m1, m0, 8 1861 psrlq m2, m1, 8 1862 LOWPASS 2, 1, 0, 3 1863 pavgb m1, m0 1864 DEFINE_ARGS dst, stride, stride3 1865 lea stride3q, [strideq*3] 1866 SBUTTERFLY bw, 1, 2, 0 1867 PALIGNR m2, m1, 2, m0 1868 movd [dstq+strideq*0], m1 1869 movd [dstq+strideq*1], m2 1870 punpckhdq m1, m1 1871 punpckhdq m2, m2 1872 movd [dstq+strideq*2], m1 1873 movd [dstq+stride3q ], m2 1874 RET 1875%endmacro 1876 1877INIT_MMX mmxext 1878HU_MMX_FUNCS 1879INIT_MMX ssse3 1880HU_MMX_FUNCS 1881 1882%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32 1883cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l 1884 
movq m0, [lq] 1885%if cpuflag(ssse3) 1886 pshufb m0, [pb_0to6_9x7] 1887%else 1888 punpcklbw m1, m0, m0 ; 0011223344556677 1889 punpckhwd m1, m1 ; 4444555566667777 1890 shufps m0, m1, q3310 ; 0123456777777777 1891%endif 1892 psrldq m1, m0, 1 1893 psrldq m2, m1, 1 1894 LOWPASS 2, 1, 0, 3 1895 pavgb m1, m0 1896 DEFINE_ARGS dst, stride, stride3, dst4 1897 lea stride3q, [strideq*3] 1898 lea dst4q, [dstq+strideq*4] 1899 SBUTTERFLY bw, 1, 2, 0 1900 movq [dstq +strideq*0], m1 1901 movhps [dst4q+strideq*0], m1 1902 PALIGNR m0, m2, m1, 2, m3 1903 movq [dstq +strideq*1], m0 1904 movhps [dst4q+strideq*1], m0 1905 PALIGNR m0, m2, m1, 4, m3 1906 movq [dstq +strideq*2], m0 1907 movhps [dst4q+strideq*2], m0 1908 PALIGNR m2, m1, 6, m3 1909 movq [dstq +stride3q ], m2 1910 movhps [dst4q+stride3q ], m2 1911 RET 1912 1913cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l 1914 mova m0, [lq] 1915%if cpuflag(ssse3) 1916 mova m3, [pb_2toE_3xF] 1917 pshufb m1, m0, [pb_1toE_2xF] 1918 pshufb m2, m0, m3 1919%else 1920 pand m3, m0, [pb_15x0_1xm1] 1921 psrldq m1, m0, 1 1922 por m1, m3 1923 punpckhbw m3, m3 1924 psrldq m2, m0, 2 1925 por m2, m3 1926%endif 1927 LOWPASS 2, 1, 0, 4 1928 pavgb m1, m0 1929 DEFINE_ARGS dst, stride, stride9, cnt 1930 lea stride9q, [strideq*8+strideq] 1931 mov cntd, 4 1932 SBUTTERFLY bw, 1, 2, 0 1933 1934.loop: 1935 mova [dstq+strideq*0], m1 1936 mova [dstq+strideq*8], m2 1937 PALIGNR m0, m2, m1, 2, m4 1938%if cpuflag(ssse3) 1939 pshufb m2, m3 1940%else 1941 psrldq m2, 2 1942 por m2, m3 1943%endif 1944 mova [dstq+strideq*1], m0 1945 mova [dstq+stride9q ], m2 1946 PALIGNR m1, m2, m0, 2, m4 1947%if cpuflag(ssse3) 1948 pshufb m2, m3 1949%else 1950 psrldq m2, 2 1951 por m2, m3 1952%endif 1953 lea dstq, [dstq+strideq*2] 1954 dec cntd 1955 jg .loop 1956 RET 1957 1958cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l 1959 mova m1, [lq] 1960 mova m0, [lq+16] 1961 PALIGNR m2, m0, m1, 1, m5 1962 PALIGNR m3, m0, m1, 2, m5 1963 LOWPASS 3, 2, 1, 5 1964 pavgb m2, m1 1965%if 
cpuflag(ssse3) 1966 mova m4, [pb_2toE_3xF] 1967 pshufb m5, m0, [pb_1toE_2xF] 1968 pshufb m1, m0, m4 1969%else 1970 pand m4, m0, [pb_15x0_1xm1] 1971 psrldq m5, m0, 1 1972 por m5, m4 1973 punpckhbw m4, m4 1974 psrldq m1, m0, 2 1975 por m1, m4 1976%endif 1977 LOWPASS 1, 5, 0, 6 1978 pavgb m0, m5 1979 DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24 1980 mov cntd, 8 1981 xor stride0q, stride0q 1982 lea dst8q, [dstq +strideq*8] 1983 lea dst16q, [dst8q +strideq*8] 1984 lea dst24q, [dst16q+strideq*8] 1985 SBUTTERFLY bw, 0, 1, 5 1986 SBUTTERFLY bw, 2, 3, 5 1987%if cpuflag(ssse3) 1988 pshufb m6, m1, [pb_15] 1989%else 1990 pshufhw m6, m4, q3333 1991 punpckhqdq m6, m6 1992%endif 1993 1994.loop: 1995 mova [dstq +stride0q+ 0], m2 1996 mova [dstq +stride0q+16], m3 1997 mova [dst8q +stride0q+ 0], m3 1998 mova [dst8q +stride0q+16], m0 1999 mova [dst16q+stride0q+ 0], m0 2000 mova [dst16q+stride0q+16], m1 2001 mova [dst24q+stride0q+ 0], m1 2002 mova [dst24q+stride0q+16], m6 2003%if cpuflag(avx) 2004 palignr m2, m3, m2, 2 2005 palignr m3, m0, m3, 2 2006 palignr m0, m1, m0, 2 2007 pshufb m1, m4 2008%elif cpuflag(ssse3) 2009 pshufb m5, m1, m4 2010 palignr m1, m0, 2 2011 palignr m0, m3, 2 2012 palignr m3, m2, 2 2013 mova m2, m3 2014 mova m3, m0 2015 mova m0, m1 2016 mova m1, m5 2017%else 2018 ; half-integrated version of PALIGNR 2019 pslldq m5, m1, 14 2020 pslldq m7, m0, 14 2021 psrldq m1, 2 2022 psrldq m0, 2 2023 por m1, m4 2024 por m0, m5 2025 pslldq m5, m3, 14 2026 psrldq m3, 2 2027 por m3, m7 2028 psrldq m2, 2 2029 por m2, m5 2030%endif 2031 add stride0q, strideq 2032 dec cntd 2033 jg .loop 2034 RET 2035%endmacro 2036 2037INIT_XMM sse2 2038HU_XMM_FUNCS 8 2039INIT_XMM ssse3 2040HU_XMM_FUNCS 7 2041INIT_XMM avx 2042HU_XMM_FUNCS 7 2043 2044; FIXME 127, 128, 129 ? 2045