; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 30 31filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 32pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 33 34pb_0_1: times 4 db 0, 1 35pb_2_3: times 4 db 2, 3 36pw_1: times 4 dw 1 37pw_2: times 4 dw 2 38pw_4: times 4 dw 4 39pw_512: times 4 dw 512 40pw_2048: times 4 dw 2048 41 42%macro JMP_TABLE 3-* 43 %xdefine %1_%2_table (%%table - 2*4) 44 %xdefine %%base mangle(private_prefix %+ _%1_%2) 45 %%table: 46 %rep %0 - 2 47 dd %%base %+ .%3 - (%%table - 2*4) 48 %rotate 1 49 %endrep 50%endmacro 51 52%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) 53%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) 54%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) 55 56JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 57JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ 58 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ 59 s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 60JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 61JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ 62 s4-8*4, s8-8*4, s16-8*4, s32-8*4 63JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 64JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 65JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 66 67cextern smooth_weights_1d_16bpc 68cextern smooth_weights_2d_16bpc 69cextern filter_intra_taps 70 71SECTION .text 72 73INIT_XMM ssse3 74cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h 75 LEA r5, ipred_dc_left_16bpc_ssse3_table 76 movd m4, wm 77 tzcnt wd, wm 78 add tlq, 2 79 movifnidn hd, hm 80 pxor m3, m3 81 pavgw m4, m3 82 movd m5, wd 83 movu m0, [tlq] 84 movsxd r6, [r5+wq*4] 85 add r6, r5 86 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table 87 movsxd wq, [r5+wq*4] 88 add wq, r5 89 jmp r6 90 91cglobal 
ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 92 LEA r5, ipred_dc_left_16bpc_ssse3_table 93 mov hd, hm 94 movd m4, hm 95 tzcnt r6d, hd 96 sub tlq, hq 97 tzcnt wd, wm 98 pxor m3, m3 99 sub tlq, hq 100 pavgw m4, m3 101 movd m5, r6d 102 movu m0, [tlq] 103 movsxd r6, [r5+r6*4] 104 add r6, r5 105 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table 106 movsxd wq, [r5+wq*4] 107 add wq, r5 108 jmp r6 109.h64: 110 movu m2, [tlq+112] 111 movu m1, [tlq+ 96] 112 paddw m0, m2 113 movu m2, [tlq+ 80] 114 paddw m1, m2 115 movu m2, [tlq+ 64] 116 paddw m0, m2 117 paddw m0, m1 118.h32: 119 movu m1, [tlq+ 48] 120 movu m2, [tlq+ 32] 121 paddw m1, m2 122 paddw m0, m1 123.h16: 124 movu m1, [tlq+ 16] 125 paddw m0, m1 126.h8: 127 movhlps m1, m0 128 paddw m0, m1 129.h4: 130 punpcklwd m0, m3 131 paddd m4, m0 132 punpckhqdq m0, m0 133 paddd m0, m4 134 pshuflw m4, m0, q1032 135 paddd m0, m4 136 psrld m0, m5 137 lea stride3q, [strideq*3] 138 pshuflw m0, m0, q0000 139 punpcklqdq m0, m0 140 jmp wq 141 142cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 143 movifnidn hd, hm 144 tzcnt r6d, hd 145 lea r5d, [wq+hq] 146 movd m4, r5d 147 tzcnt r5d, r5d 148 movd m5, r5d 149 LEA r5, ipred_dc_16bpc_ssse3_table 150 tzcnt wd, wd 151 movsxd r6, [r5+r6*4] 152 movsxd wq, [r5+wq*4+5*4] 153 pxor m3, m3 154 psrlw m4, 1 155 add r6, r5 156 add wq, r5 157 lea stride3q, [strideq*3] 158 jmp r6 159.h4: 160 movq m0, [tlq-8] 161 jmp wq 162.w4: 163 movq m1, [tlq+2] 164 paddw m1, m0 165 punpckhwd m0, m3 166 punpcklwd m1, m3 167 paddd m0, m1 168 paddd m4, m0 169 punpckhqdq m0, m0 170 paddd m0, m4 171 pshuflw m1, m0, q1032 172 paddd m0, m1 173 cmp hd, 4 174 jg .w4_mul 175 psrlw m0, 3 176 jmp .w4_end 177.w4_mul: 178 mov r2d, 0xAAAB 179 mov r3d, 0x6667 180 cmp hd, 16 181 cmove r2d, r3d 182 psrld m0, 2 183 movd m1, r2d 184 pmulhuw m0, m1 185 psrlw m0, 1 186.w4_end: 187 pshuflw m0, m0, q0000 188.s4: 189 movq [dstq+strideq*0], m0 190 movq [dstq+strideq*1], m0 191 movq 
[dstq+strideq*2], m0 192 movq [dstq+stride3q ], m0 193 lea dstq, [dstq+strideq*4] 194 sub hd, 4 195 jg .s4 196 RET 197.h8: 198 mova m0, [tlq-16] 199 jmp wq 200.w8: 201 movu m1, [tlq+2] 202 paddw m0, m1 203 punpcklwd m1, m0, m3 204 punpckhwd m0, m3 205 paddd m0, m1 206 paddd m4, m0 207 punpckhqdq m0, m0 208 paddd m0, m4 209 pshuflw m1, m0, q1032 210 paddd m0, m1 211 psrld m0, m5 212 cmp hd, 8 213 je .w8_end 214 mov r2d, 0xAAAB 215 mov r3d, 0x6667 216 cmp hd, 32 217 cmove r2d, r3d 218 movd m1, r2d 219 pmulhuw m0, m1 220 psrlw m0, 1 221.w8_end: 222 pshuflw m0, m0, q0000 223 punpcklqdq m0, m0 224.s8: 225 mova [dstq+strideq*0], m0 226 mova [dstq+strideq*1], m0 227 mova [dstq+strideq*2], m0 228 mova [dstq+stride3q ], m0 229 lea dstq, [dstq+strideq*4] 230 sub hd, 4 231 jg .s8 232 RET 233.h16: 234 mova m0, [tlq-32] 235 paddw m0, [tlq-16] 236 jmp wq 237.w16: 238 movu m1, [tlq+ 2] 239 movu m2, [tlq+18] 240 paddw m1, m2 241 paddw m0, m1 242 punpckhwd m1, m0, m3 243 punpcklwd m0, m3 244 paddd m0, m1 245 paddd m4, m0 246 punpckhqdq m0, m0 247 paddd m0, m4 248 pshuflw m1, m0, q1032 249 paddd m0, m1 250 psrld m0, m5 251 cmp hd, 16 252 je .w16_end 253 mov r2d, 0xAAAB 254 mov r3d, 0x6667 255 test hd, 8|32 256 cmovz r2d, r3d 257 movd m1, r2d 258 pmulhuw m0, m1 259 psrlw m0, 1 260.w16_end: 261 pshuflw m0, m0, q0000 262 punpcklqdq m0, m0 263.s16c: 264 mova m1, m0 265.s16: 266 mova [dstq+strideq*0+16*0], m0 267 mova [dstq+strideq*0+16*1], m1 268 mova [dstq+strideq*1+16*0], m0 269 mova [dstq+strideq*1+16*1], m1 270 mova [dstq+strideq*2+16*0], m0 271 mova [dstq+strideq*2+16*1], m1 272 mova [dstq+stride3q +16*0], m0 273 mova [dstq+stride3q +16*1], m1 274 lea dstq, [dstq+strideq*4] 275 sub hd, 4 276 jg .s16 277 RET 278.h32: 279 mova m0, [tlq-64] 280 paddw m0, [tlq-48] 281 paddw m0, [tlq-32] 282 paddw m0, [tlq-16] 283 jmp wq 284.w32: 285 movu m1, [tlq+ 2] 286 movu m2, [tlq+18] 287 paddw m1, m2 288 movu m2, [tlq+34] 289 paddw m0, m2 290 movu m2, [tlq+50] 291 paddw m1, m2 292 paddw m0, m1 293 
punpcklwd m1, m0, m3 294 punpckhwd m0, m3 295 paddd m0, m1 296 paddd m4, m0 297 punpckhqdq m0, m0 298 paddd m0, m4 299 pshuflw m1, m0, q1032 300 paddd m0, m1 301 psrld m0, m5 302 cmp hd, 32 303 je .w32_end 304 mov r2d, 0xAAAB 305 mov r3d, 0x6667 306 cmp hd, 8 307 cmove r2d, r3d 308 movd m1, r2d 309 pmulhuw m0, m1 310 psrlw m0, 1 311.w32_end: 312 pshuflw m0, m0, q0000 313 punpcklqdq m0, m0 314.s32c: 315 mova m1, m0 316 mova m2, m0 317 mova m3, m0 318.s32: 319 mova [dstq+strideq*0+16*0], m0 320 mova [dstq+strideq*0+16*1], m1 321 mova [dstq+strideq*0+16*2], m2 322 mova [dstq+strideq*0+16*3], m3 323 mova [dstq+strideq*1+16*0], m0 324 mova [dstq+strideq*1+16*1], m1 325 mova [dstq+strideq*1+16*2], m2 326 mova [dstq+strideq*1+16*3], m3 327 lea dstq, [dstq+strideq*2] 328 sub hd, 2 329 jg .s32 330 RET 331.h64: 332 mova m0, [tlq-128] 333 mova m1, [tlq-112] 334 paddw m0, [tlq- 96] 335 paddw m1, [tlq- 80] 336 paddw m0, [tlq- 64] 337 paddw m1, [tlq- 48] 338 paddw m0, [tlq- 32] 339 paddw m1, [tlq- 16] 340 paddw m0, m1 341 jmp wq 342.w64: 343 movu m1, [tlq+ 2] 344 movu m2, [tlq+ 18] 345 paddw m1, m2 346 movu m2, [tlq+ 34] 347 paddw m0, m2 348 movu m2, [tlq+ 50] 349 paddw m1, m2 350 movu m2, [tlq+ 66] 351 paddw m0, m2 352 movu m2, [tlq+ 82] 353 paddw m1, m2 354 movu m2, [tlq+ 98] 355 paddw m0, m2 356 movu m2, [tlq+114] 357 paddw m1, m2 358 paddw m0, m1 359 punpcklwd m1, m0, m3 360 punpckhwd m0, m3 361 paddd m0, m1 362 paddd m4, m0 363 punpckhqdq m0, m0 364 paddd m0, m4 365 pshuflw m1, m0, q1032 366 paddd m0, m1 367 psrld m0, m5 368 cmp hd, 64 369 je .w64_end 370 mov r2d, 0xAAAB 371 mov r3d, 0x6667 372 cmp hd, 16 373 cmove r2d, r3d 374 movd m1, r2d 375 pmulhuw m0, m1 376 psrlw m0, 1 377.w64_end: 378 pshuflw m0, m0, q0000 379 punpcklqdq m0, m0 380.s64: 381 mova [dstq+16*0], m0 382 mova [dstq+16*1], m0 383 mova [dstq+16*2], m0 384 mova [dstq+16*3], m0 385 mova [dstq+16*4], m0 386 mova [dstq+16*5], m0 387 mova [dstq+16*6], m0 388 mova [dstq+16*7], m0 389 add dstq, strideq 390 dec hd 
391 jg .s64 392 RET 393 394cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 395 mov r6d, r8m 396 LEA r5, ipred_dc_128_16bpc_ssse3_table 397 tzcnt wd, wm 398 shr r6d, 11 399 movifnidn hd, hm 400 movsxd wq, [r5+wq*4] 401 movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] 402 add wq, r5 403 lea stride3q, [strideq*3] 404 jmp wq 405 406cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 407 LEA r5, ipred_dc_splat_16bpc_ssse3_table 408 movifnidn hd, hm 409 movu m0, [tlq+ 2] 410 movu m1, [tlq+ 18] 411 movu m2, [tlq+ 34] 412 movu m3, [tlq+ 50] 413 cmp wd, 64 414 je .w64 415 tzcnt wd, wd 416 movsxd wq, [r5+wq*4] 417 add wq, r5 418 lea stride3q, [strideq*3] 419 jmp wq 420.w64: 421 WIN64_SPILL_XMM 8 422 movu m4, [tlq+ 66] 423 movu m5, [tlq+ 82] 424 movu m6, [tlq+ 98] 425 movu m7, [tlq+114] 426.w64_loop: 427 mova [dstq+16*0], m0 428 mova [dstq+16*1], m1 429 mova [dstq+16*2], m2 430 mova [dstq+16*3], m3 431 mova [dstq+16*4], m4 432 mova [dstq+16*5], m5 433 mova [dstq+16*6], m6 434 mova [dstq+16*7], m7 435 add dstq, strideq 436 dec hd 437 jg .w64_loop 438 RET 439 440cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 441%define base r5-ipred_h_16bpc_ssse3_table 442 tzcnt wd, wm 443 LEA r5, ipred_h_16bpc_ssse3_table 444 movifnidn hd, hm 445 movsxd wq, [r5+wq*4] 446 movddup m2, [base+pb_0_1] 447 movddup m3, [base+pb_2_3] 448 add wq, r5 449 lea stride3q, [strideq*3] 450 jmp wq 451.w4: 452 sub tlq, 8 453 movq m3, [tlq] 454 pshuflw m0, m3, q3333 455 pshuflw m1, m3, q2222 456 pshuflw m2, m3, q1111 457 pshuflw m3, m3, q0000 458 movq [dstq+strideq*0], m0 459 movq [dstq+strideq*1], m1 460 movq [dstq+strideq*2], m2 461 movq [dstq+stride3q ], m3 462 lea dstq, [dstq+strideq*4] 463 sub hd, 4 464 jg .w4 465 RET 466.w8: 467 sub tlq, 8 468 movq m3, [tlq] 469 punpcklwd m3, m3 470 pshufd m0, m3, q3333 471 pshufd m1, m3, q2222 472 pshufd m2, m3, q1111 473 pshufd m3, m3, q0000 474 mova [dstq+strideq*0], m0 475 mova [dstq+strideq*1], m1 476 mova 
[dstq+strideq*2], m2 477 mova [dstq+stride3q ], m3 478 lea dstq, [dstq+strideq*4] 479 sub hd, 4 480 jg .w8 481 RET 482.w16: 483 sub tlq, 4 484 movd m1, [tlq] 485 pshufb m0, m1, m3 486 pshufb m1, m2 487 mova [dstq+strideq*0+16*0], m0 488 mova [dstq+strideq*0+16*1], m0 489 mova [dstq+strideq*1+16*0], m1 490 mova [dstq+strideq*1+16*1], m1 491 lea dstq, [dstq+strideq*2] 492 sub hd, 2 493 jg .w16 494 RET 495.w32: 496 sub tlq, 4 497 movd m1, [tlq] 498 pshufb m0, m1, m3 499 pshufb m1, m2 500 mova [dstq+strideq*0+16*0], m0 501 mova [dstq+strideq*0+16*1], m0 502 mova [dstq+strideq*0+16*2], m0 503 mova [dstq+strideq*0+16*3], m0 504 mova [dstq+strideq*1+16*0], m1 505 mova [dstq+strideq*1+16*1], m1 506 mova [dstq+strideq*1+16*2], m1 507 mova [dstq+strideq*1+16*3], m1 508 lea dstq, [dstq+strideq*2] 509 sub hd, 2 510 jg .w32 511 RET 512.w64: 513 sub tlq, 2 514 movd m0, [tlq] 515 pshufb m0, m2 516 mova [dstq+16*0], m0 517 mova [dstq+16*1], m0 518 mova [dstq+16*2], m0 519 mova [dstq+16*3], m0 520 mova [dstq+16*4], m0 521 mova [dstq+16*5], m0 522 mova [dstq+16*6], m0 523 mova [dstq+16*7], m0 524 add dstq, strideq 525 dec hd 526 jg .w64 527 RET 528 529cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left 530%define base r5-ipred_paeth_16bpc_ssse3_table 531 movifnidn hd, hm 532 pshuflw m4, [tlq], q0000 533 mov leftq, tlq 534 add hd, hd 535 punpcklqdq m4, m4 ; topleft 536 sub leftq, hq 537 and wd, ~7 538 jnz .w8 539 movddup m5, [tlq+2] ; top 540 psubw m6, m5, m4 541 pabsw m7, m6 542.w4_loop: 543 movd m1, [leftq+hq-4] 544 punpcklwd m1, m1 545 punpckldq m1, m1 ; left 546%macro PAETH 0 547 paddw m0, m6, m1 548 psubw m2, m4, m0 ; tldiff 549 psubw m0, m5 ; tdiff 550 pabsw m2, m2 551 pabsw m0, m0 552 pminsw m2, m0 553 pcmpeqw m0, m2 554 pand m3, m5, m0 555 pandn m0, m4 556 por m0, m3 557 pcmpgtw m3, m7, m2 558 pand m0, m3 559 pandn m3, m1 560 por m0, m3 561%endmacro 562 PAETH 563 movhps [dstq+strideq*0], m0 564 movq [dstq+strideq*1], m0 565 lea dstq, [dstq+strideq*2] 566 sub hd, 
2*2 567 jg .w4_loop 568 RET 569.w8: 570%if ARCH_X86_32 571 PUSH r6 572 %define r7d hm 573 %assign regs_used 7 574%elif WIN64 575 movaps r4m, m8 576 PUSH r7 577 %assign regs_used 8 578%endif 579%if ARCH_X86_64 580 movddup m8, [pb_0_1] 581%endif 582 lea tlq, [tlq+wq*2+2] 583 neg wq 584 mov r7d, hd 585.w8_loop0: 586 movu m5, [tlq+wq*2] 587 mov r6, dstq 588 add dstq, 16 589 psubw m6, m5, m4 590 pabsw m7, m6 591.w8_loop: 592 movd m1, [leftq+hq-2] 593%if ARCH_X86_64 594 pshufb m1, m8 595%else 596 pshuflw m1, m1, q0000 597 punpcklqdq m1, m1 598%endif 599 PAETH 600 mova [r6], m0 601 add r6, strideq 602 sub hd, 1*2 603 jg .w8_loop 604 mov hd, r7d 605 add wq, 8 606 jl .w8_loop0 607%if WIN64 608 movaps m8, r4m 609%endif 610 RET 611 612%if ARCH_X86_64 613DECLARE_REG_TMP 7 614%else 615DECLARE_REG_TMP 4 616%endif 617 618cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights 619 LEA weightsq, smooth_weights_1d_16bpc 620 mov hd, hm 621 lea weightsq, [weightsq+hq*4] 622 neg hq 623 movd m5, [tlq+hq*2] ; bottom 624 pshuflw m5, m5, q0000 625 punpcklqdq m5, m5 626 cmp wd, 4 627 jne .w8 628 movddup m4, [tlq+2] ; top 629 lea r3, [strideq*3] 630 psubw m4, m5 ; top - bottom 631.w4_loop: 632 movq m1, [weightsq+hq*2] 633 punpcklwd m1, m1 634 pshufd m0, m1, q1100 635 punpckhdq m1, m1 636 pmulhrsw m0, m4 637 pmulhrsw m1, m4 638 paddw m0, m5 639 paddw m1, m5 640 movq [dstq+strideq*0], m0 641 movhps [dstq+strideq*1], m0 642 movq [dstq+strideq*2], m1 643 movhps [dstq+r3 ], m1 644 lea dstq, [dstq+strideq*4] 645 add hq, 4 646 jl .w4_loop 647 RET 648.w8: 649%if ARCH_X86_32 650 PUSH r6 651 %assign regs_used 7 652 mov hm, hq 653 %define hq hm 654%elif WIN64 655 PUSH r7 656 %assign regs_used 8 657%endif 658.w8_loop0: 659 mov t0, hq 660 movu m4, [tlq+2] 661 add tlq, 16 662 mov r6, dstq 663 add dstq, 16 664 psubw m4, m5 665.w8_loop: 666 movq m3, [weightsq+t0*2] 667 punpcklwd m3, m3 668 pshufd m0, m3, q0000 669 pshufd m1, m3, q1111 670 pshufd m2, m3, q2222 671 pshufd m3, m3, q3333 672 REPX 
{pmulhrsw x, m4}, m0, m1, m2, m3 673 REPX {paddw x, m5}, m0, m1, m2, m3 674 mova [r6+strideq*0], m0 675 mova [r6+strideq*1], m1 676 lea r6, [r6+strideq*2] 677 mova [r6+strideq*0], m2 678 mova [r6+strideq*1], m3 679 lea r6, [r6+strideq*2] 680 add t0, 4 681 jl .w8_loop 682 sub wd, 8 683 jg .w8_loop0 684 RET 685 686cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights 687 LEA weightsq, smooth_weights_1d_16bpc 688 mov wd, wm 689 movifnidn hd, hm 690 movd m5, [tlq+wq*2] ; right 691 sub tlq, 8 692 add hd, hd 693 pshuflw m5, m5, q0000 694 sub tlq, hq 695 punpcklqdq m5, m5 696 cmp wd, 4 697 jne .w8 698 movddup m4, [weightsq+4*2] 699 lea r3, [strideq*3] 700.w4_loop: 701 movq m1, [tlq+hq] ; left 702 punpcklwd m1, m1 703 psubw m1, m5 ; left - right 704 pshufd m0, m1, q3322 705 punpckldq m1, m1 706 pmulhrsw m0, m4 707 pmulhrsw m1, m4 708 paddw m0, m5 709 paddw m1, m5 710 movhps [dstq+strideq*0], m0 711 movq [dstq+strideq*1], m0 712 movhps [dstq+strideq*2], m1 713 movq [dstq+r3 ], m1 714 lea dstq, [dstq+strideq*4] 715 sub hd, 4*2 716 jg .w4_loop 717 RET 718.w8: 719 lea weightsq, [weightsq+wq*4] 720 neg wq 721%if ARCH_X86_32 722 PUSH r6 723 %assign regs_used 7 724 %define hd hm 725%elif WIN64 726 PUSH r7 727 %assign regs_used 8 728%endif 729.w8_loop0: 730 mov t0d, hd 731 mova m4, [weightsq+wq*2] 732 mov r6, dstq 733 add dstq, 16 734.w8_loop: 735 movq m3, [tlq+t0*(1+ARCH_X86_32)] 736 punpcklwd m3, m3 737 psubw m3, m5 738 pshufd m0, m3, q3333 739 pshufd m1, m3, q2222 740 pshufd m2, m3, q1111 741 pshufd m3, m3, q0000 742 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 743 REPX {paddw x, m5}, m0, m1, m2, m3 744 mova [r6+strideq*0], m0 745 mova [r6+strideq*1], m1 746 lea r6, [r6+strideq*2] 747 mova [r6+strideq*0], m2 748 mova [r6+strideq*1], m3 749 lea r6, [r6+strideq*2] 750 sub t0d, 4*(1+ARCH_X86_64) 751 jg .w8_loop 752 add wq, 8 753 jl .w8_loop0 754 RET 755 756%if ARCH_X86_64 757DECLARE_REG_TMP 10 758%else 759DECLARE_REG_TMP 3 760%endif 761 762cglobal ipred_smooth_16bpc, 3, 
7, 8, dst, stride, tl, w, h, \ 763 h_weights, v_weights, top 764 LEA h_weightsq, smooth_weights_2d_16bpc 765 mov wd, wm 766 mov hd, hm 767 movd m7, [tlq+wq*2] ; right 768 lea v_weightsq, [h_weightsq+hq*8] 769 neg hq 770 movd m6, [tlq+hq*2] ; bottom 771 pshuflw m7, m7, q0000 772 pshuflw m6, m6, q0000 773 cmp wd, 4 774 jne .w8 775 movq m4, [tlq+2] ; top 776 mova m5, [h_weightsq+4*4] 777 punpcklwd m4, m6 ; top, bottom 778 pxor m6, m6 779.w4_loop: 780 movq m1, [v_weightsq+hq*4] 781 sub tlq, 4 782 movd m3, [tlq] ; left 783 pshufd m0, m1, q0000 784 pshufd m1, m1, q1111 785 pmaddwd m0, m4 786 punpcklwd m3, m7 ; left, right 787 pmaddwd m1, m4 788 pshufd m2, m3, q1111 789 pshufd m3, m3, q0000 790 pmaddwd m2, m5 791 pmaddwd m3, m5 792 paddd m0, m2 793 paddd m1, m3 794 psrld m0, 8 795 psrld m1, 8 796 packssdw m0, m1 797 pavgw m0, m6 798 movq [dstq+strideq*0], m0 799 movhps [dstq+strideq*1], m0 800 lea dstq, [dstq+strideq*2] 801 add hq, 2 802 jl .w4_loop 803 RET 804.w8: 805%if ARCH_X86_32 806 lea h_weightsq, [h_weightsq+wq*4] 807 mov t0, tlq 808 mov r1m, tlq 809 mov r2m, hq 810 %define m8 [h_weightsq+16*0] 811 %define m9 [h_weightsq+16*1] 812%else 813%if WIN64 814 movaps r4m, m8 815 movaps r6m, m9 816 PUSH r7 817 PUSH r8 818%endif 819 PUSH r9 820 PUSH r10 821 %assign regs_used 11 822 lea h_weightsq, [h_weightsq+wq*8] 823 lea topq, [tlq+wq*2] 824 neg wq 825 mov r8, tlq 826 mov r9, hq 827%endif 828 punpcklqdq m6, m6 829.w8_loop0: 830%if ARCH_X86_32 831 movu m5, [t0+2] 832 add t0, 16 833 mov r0m, t0 834%else 835 movu m5, [topq+wq*2+2] 836 mova m8, [h_weightsq+wq*4+16*0] 837 mova m9, [h_weightsq+wq*4+16*1] 838%endif 839 mov t0, dstq 840 add dstq, 16 841 punpcklwd m4, m5, m6 842 punpckhwd m5, m6 843.w8_loop: 844 movd m1, [v_weightsq+hq*4] 845 sub tlq, 2 846 movd m3, [tlq] ; left 847 pshufd m1, m1, q0000 848 pmaddwd m0, m4, m1 849 pshuflw m3, m3, q0000 850 pmaddwd m1, m5 851 punpcklwd m3, m7 ; left, right 852 pmaddwd m2, m8, m3 853 pmaddwd m3, m9 854 paddd m0, m2 855 paddd m1, m3 
856 psrld m0, 8 857 psrld m1, 8 858 packssdw m0, m1 859 pxor m1, m1 860 pavgw m0, m1 861 mova [t0], m0 862 add t0, strideq 863 inc hq 864 jl .w8_loop 865%if ARCH_X86_32 866 mov t0, r0m 867 mov tlq, r1m 868 add h_weightsq, 16*2 869 mov hq, r2m 870 sub dword wm, 8 871 jg .w8_loop0 872%else 873 mov tlq, r8 874 mov hq, r9 875 add wq, 8 876 jl .w8_loop0 877%endif 878%if WIN64 879 movaps m8, r4m 880 movaps m9, r6m 881%endif 882 RET 883 884%if ARCH_X86_64 885cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter 886%else 887cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter 888%define m8 [esp+16*0] 889%define m9 [esp+16*1] 890%define m10 [esp+16*2] 891%define m11 [esp+16*3] 892%define m12 [esp+16*4] 893%define m13 [esp+16*5] 894%define m14 [esp+16*6] 895%define m15 [esp+16*7] 896%endif 897%define base r6-$$ 898 movifnidn hd, hm 899 movd m6, r8m ; bitdepth_max 900%ifidn filterd, filterm 901 movzx filterd, filterb 902%else 903 movzx filterd, byte filterm 904%endif 905 LEA r6, $$ 906 shl filterd, 6 907 movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 908 mova m1, [base+filter_intra_taps+filterq+16*0] 909 mova m2, [base+filter_intra_taps+filterq+16*1] 910 mova m3, [base+filter_intra_taps+filterq+16*2] 911 mova m4, [base+filter_intra_taps+filterq+16*3] 912 pxor m5, m5 913%if ARCH_X86_64 914 punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper 915 punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid 916 punpcklbw m10, m5, m2 ; having to perform sign-extension. 
917 punpckhbw m11, m5, m2 918 punpcklbw m12, m5, m3 919 punpckhbw m13, m5, m3 920 punpcklbw m14, m5, m4 921 punpckhbw m15, m5, m4 922%else 923 punpcklbw m7, m5, m1 924 mova m8, m7 925 punpckhbw m7, m5, m1 926 mova m9, m7 927 punpcklbw m7, m5, m2 928 mova m10, m7 929 punpckhbw m7, m5, m2 930 mova m11, m7 931 punpcklbw m7, m5, m3 932 mova m12, m7 933 punpckhbw m7, m5, m3 934 mova m13, m7 935 punpcklbw m7, m5, m4 936 mova m14, m7 937 punpckhbw m7, m5, m4 938 mova m15, m7 939%endif 940 mova m7, [base+filter_shuf] 941 add hd, hd 942 mov r5, dstq 943 pshuflw m6, m6, q0000 944 mov r6, tlq 945 punpcklqdq m6, m6 946 sub tlq, hq 947.left_loop: 948 pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ 949 pshufd m1, m0, q0000 950 pmaddwd m2, m8, m1 951 pmaddwd m1, m9 952 pshufd m4, m0, q1111 953 pmaddwd m3, m10, m4 954 pmaddwd m4, m11 955 paddd m2, m3 956 paddd m1, m4 957 pshufd m4, m0, q2222 958 pmaddwd m3, m12, m4 959 pmaddwd m4, m13 960 paddd m2, m3 961 paddd m1, m4 962 pshufd m3, m0, q3333 963 pmaddwd m0, m14, m3 964 pmaddwd m3, m15 965 paddd m0, m2 966 paddd m1, m3 967 psrad m0, 11 ; x >> 3 968 psrad m1, 11 969 packssdw m0, m1 970 pmaxsw m0, m5 971 pavgw m0, m5 ; (x + 8) >> 4 972 pminsw m0, m6 973 movq [dstq+strideq*0], m0 974 movhps [dstq+strideq*1], m0 975 movlps m0, [tlq+hq-10] 976 lea dstq, [dstq+strideq*2] 977 sub hd, 2*2 978 jg .left_loop 979 sub wd, 4 980 jz .end 981 sub tld, r6d ; -h*2 982 sub r6, r5 ; tl-dst 983.right_loop0: 984 add r5, 8 985 mov hd, tld 986 movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ 987 mov dstq, r5 988.right_loop: 989 pshufd m2, m0, q0000 990 pmaddwd m1, m8, m2 991 pmaddwd m2, m9 992 pshufd m4, m0, q1111 993 pmaddwd m3, m10, m4 994 pmaddwd m4, m11 995 pinsrw m0, [dstq+strideq*0-2], 5 996 paddd m1, m3 997 paddd m2, m4 998 pshufd m0, m0, q2222 999 movddup m4, [dstq+strideq*1-8] 1000 pmaddwd m3, m12, m0 1001 pmaddwd m0, m13 1002 paddd m1, m3 1003 paddd m0, m2 1004 pshuflw m2, m4, q3333 1005 punpcklwd m2, m5 1006 pmaddwd m3, m14, m2 1007 pmaddwd m2, m15 1008 
paddd m1, m3 1009 paddd m0, m2 1010 psrad m1, 11 1011 psrad m0, 11 1012 packssdw m0, m1 1013 pmaxsw m0, m5 1014 pavgw m0, m5 1015 pminsw m0, m6 1016 movhps [dstq+strideq*0], m0 1017 movq [dstq+strideq*1], m0 1018 palignr m0, m4, 14 1019 lea dstq, [dstq+strideq*2] 1020 add hd, 2*2 1021 jl .right_loop 1022 sub wd, 4 1023 jg .right_loop0 1024.end: 1025 RET 1026 1027%if UNIX64 1028DECLARE_REG_TMP 7 1029%else 1030DECLARE_REG_TMP 5 1031%endif 1032 1033cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac 1034 LEA t0, ipred_cfl_left_16bpc_ssse3_table 1035 movd m4, wd 1036 tzcnt wd, wd 1037 movifnidn hd, hm 1038 add tlq, 2 1039 movsxd r6, [t0+wq*4] 1040 movd m5, wd 1041 jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) 1042 1043cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha 1044 movifnidn hd, hm 1045 LEA t0, ipred_cfl_left_16bpc_ssse3_table 1046 tzcnt wd, wm 1047 lea r6d, [hq*2] 1048 movd m4, hd 1049 sub tlq, r6 1050 tzcnt r6d, hd 1051 movd m5, r6d 1052 movsxd r6, [t0+r6*4] 1053.start: 1054 movd m7, r7m 1055 movu m0, [tlq] 1056 add r6, t0 1057 add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table 1058 movsxd wq, [t0+wq*4] 1059 pxor m6, m6 1060 pshuflw m7, m7, q0000 1061 pcmpeqw m3, m3 1062 add wq, t0 1063 movifnidn acq, acmp 1064 pavgw m4, m6 1065 punpcklqdq m7, m7 1066 jmp r6 1067.h32: 1068 movu m1, [tlq+48] 1069 movu m2, [tlq+32] 1070 paddw m0, m1 1071 paddw m0, m2 1072.h16: 1073 movu m1, [tlq+16] 1074 paddw m0, m1 1075.h8: 1076 pshufd m1, m0, q1032 1077 paddw m0, m1 1078.h4: 1079 pmaddwd m0, m3 1080 psubd m4, m0 1081 pshuflw m0, m4, q1032 1082 paddd m0, m4 1083 psrld m0, m5 1084 pshuflw m0, m0, q0000 1085 punpcklqdq m0, m0 1086 jmp wq 1087 1088%macro IPRED_CFL 2 ; dst, src 1089 pabsw m%1, m%2 1090 pmulhrsw m%1, m2 1091 psignw m%2, m1 1092 psignw m%1, m%2 1093 paddw m%1, m0 1094 pmaxsw m%1, m6 1095 pminsw m%1, m7 1096%endmacro 1097 1098cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha 
1099 movifnidn hd, hm 1100 tzcnt r6d, hd 1101 lea t0d, [wq+hq] 1102 movd m4, t0d 1103 tzcnt t0d, t0d 1104 movd m5, t0d 1105 LEA t0, ipred_cfl_16bpc_ssse3_table 1106 tzcnt wd, wd 1107 movd m7, r7m 1108 movsxd r6, [t0+r6*4] 1109 movsxd wq, [t0+wq*4+4*4] 1110 psrlw m4, 1 1111 pxor m6, m6 1112 pshuflw m7, m7, q0000 1113 add r6, t0 1114 add wq, t0 1115 movifnidn acq, acmp 1116 pcmpeqw m3, m3 1117 punpcklqdq m7, m7 1118 jmp r6 1119.h4: 1120 movq m0, [tlq-8] 1121 jmp wq 1122.w4: 1123 movq m1, [tlq+2] 1124 paddw m0, m1 1125 pmaddwd m0, m3 1126 psubd m4, m0 1127 pshufd m0, m4, q1032 1128 paddd m0, m4 1129 pshuflw m4, m0, q1032 1130 paddd m0, m4 1131 cmp hd, 4 1132 jg .w4_mul 1133 psrld m0, 3 1134 jmp .w4_end 1135.w4_mul: 1136 mov r6d, 0xAAAB 1137 mov r2d, 0x6667 1138 cmp hd, 16 1139 cmove r6d, r2d 1140 movd m1, r6d 1141 psrld m0, 2 1142 pmulhuw m0, m1 1143 psrlw m0, 1 1144.w4_end: 1145 pshuflw m0, m0, q0000 1146 punpcklqdq m0, m0 1147.s4: 1148 movd m1, alpham 1149 lea r6, [strideq*3] 1150 pshuflw m1, m1, q0000 1151 punpcklqdq m1, m1 1152 pabsw m2, m1 1153 psllw m2, 9 1154.s4_loop: 1155 mova m4, [acq+16*0] 1156 mova m5, [acq+16*1] 1157 add acq, 16*2 1158 IPRED_CFL 3, 4 1159 IPRED_CFL 4, 5 1160 movq [dstq+strideq*0], m3 1161 movhps [dstq+strideq*1], m3 1162 movq [dstq+strideq*2], m4 1163 movhps [dstq+r6 ], m4 1164 lea dstq, [dstq+strideq*4] 1165 sub hd, 4 1166 jg .s4_loop 1167 RET 1168.h8: 1169 mova m0, [tlq-16] 1170 jmp wq 1171.w8: 1172 movu m1, [tlq+2] 1173 paddw m0, m1 1174 pmaddwd m0, m3 1175 psubd m4, m0 1176 pshufd m0, m4, q1032 1177 paddd m0, m4 1178 pshuflw m4, m0, q1032 1179 paddd m0, m4 1180 psrld m0, m5 1181 cmp hd, 8 1182 je .w8_end 1183 mov r6d, 0xAAAB 1184 mov r2d, 0x6667 1185 cmp hd, 32 1186 cmove r6d, r2d 1187 movd m1, r6d 1188 pmulhuw m0, m1 1189 psrlw m0, 1 1190.w8_end: 1191 pshuflw m0, m0, q0000 1192 punpcklqdq m0, m0 1193.s8: 1194 movd m1, alpham 1195 pshuflw m1, m1, q0000 1196 punpcklqdq m1, m1 1197 pabsw m2, m1 1198 psllw m2, 9 1199.s8_loop: 1200 mova 
m4, [acq+16*0] 1201 mova m5, [acq+16*1] 1202 add acq, 16*2 1203 IPRED_CFL 3, 4 1204 IPRED_CFL 4, 5 1205 mova [dstq+strideq*0], m3 1206 mova [dstq+strideq*1], m4 1207 lea dstq, [dstq+strideq*2] 1208 sub hd, 2 1209 jg .s8_loop 1210 RET 1211.h16: 1212 mova m0, [tlq-32] 1213 paddw m0, [tlq-16] 1214 jmp wq 1215.w16: 1216 movu m1, [tlq+ 2] 1217 movu m2, [tlq+18] 1218 paddw m1, m2 1219 paddw m0, m1 1220 pmaddwd m0, m3 1221 psubd m4, m0 1222 pshufd m0, m4, q1032 1223 paddd m0, m4 1224 pshuflw m4, m0, q1032 1225 paddd m0, m4 1226 psrld m0, m5 1227 cmp hd, 16 1228 je .w16_end 1229 mov r6d, 0xAAAB 1230 mov r2d, 0x6667 1231 test hd, 8|32 1232 cmovz r6d, r2d 1233 movd m1, r6d 1234 pmulhuw m0, m1 1235 psrlw m0, 1 1236.w16_end: 1237 pshuflw m0, m0, q0000 1238 punpcklqdq m0, m0 1239.s16: 1240 movd m1, alpham 1241 pshuflw m1, m1, q0000 1242 punpcklqdq m1, m1 1243 pabsw m2, m1 1244 psllw m2, 9 1245.s16_loop: 1246 mova m4, [acq+16*0] 1247 mova m5, [acq+16*1] 1248 add acq, 16*2 1249 IPRED_CFL 3, 4 1250 IPRED_CFL 4, 5 1251 mova [dstq+16*0], m3 1252 mova [dstq+16*1], m4 1253 add dstq, strideq 1254 dec hd 1255 jg .s16_loop 1256 RET 1257.h32: 1258 mova m0, [tlq-64] 1259 paddw m0, [tlq-48] 1260 paddw m0, [tlq-32] 1261 paddw m0, [tlq-16] 1262 jmp wq 1263.w32: 1264 movu m1, [tlq+ 2] 1265 movu m2, [tlq+18] 1266 paddw m1, m2 1267 movu m2, [tlq+34] 1268 paddw m1, m2 1269 movu m2, [tlq+50] 1270 paddw m1, m2 1271 paddw m0, m1 1272 pmaddwd m0, m3 1273 psubd m4, m0 1274 pshufd m0, m4, q1032 1275 paddd m0, m4 1276 pshuflw m4, m0, q1032 1277 paddd m0, m4 1278 psrld m0, m5 1279 cmp hd, 32 1280 je .w32_end 1281 mov r6d, 0xAAAB 1282 mov r2d, 0x6667 1283 cmp hd, 8 1284 cmove r6d, r2d 1285 movd m1, r6d 1286 pmulhuw m0, m1 1287 psrlw m0, 1 1288.w32_end: 1289 pshuflw m0, m0, q0000 1290 punpcklqdq m0, m0 1291.s32: 1292 movd m1, alpham 1293 pshuflw m1, m1, q0000 1294 punpcklqdq m1, m1 1295 pabsw m2, m1 1296 psllw m2, 9 1297.s32_loop: 1298 mova m4, [acq+16*0] 1299 mova m5, [acq+16*1] 1300 IPRED_CFL 3, 4 1301 
IPRED_CFL 4, 5 1302 mova [dstq+16*0], m3 1303 mova [dstq+16*1], m4 1304 mova m4, [acq+16*2] 1305 mova m5, [acq+16*3] 1306 add acq, 16*4 1307 IPRED_CFL 3, 4 1308 IPRED_CFL 4, 5 1309 mova [dstq+16*2], m3 1310 mova [dstq+16*3], m4 1311 add dstq, strideq 1312 dec hd 1313 jg .s32_loop 1314 RET 1315 1316cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac 1317 tzcnt wd, wm 1318 LEA t0, ipred_cfl_splat_16bpc_ssse3_table 1319 mov r6d, r7m 1320 movifnidn hd, hm 1321 shr r6d, 11 1322 movd m7, r7m 1323 movsxd wq, [t0+wq*4] 1324 movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] 1325 pshuflw m7, m7, q0000 1326 pxor m6, m6 1327 add wq, t0 1328 movifnidn acq, acmp 1329 punpcklqdq m7, m7 1330 jmp wq 1331 1332cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h 1333 movifnidn hpadd, hpadm 1334%if ARCH_X86_32 && PIC 1335 pcmpeqw m5, m5 1336 pabsw m5, m5 1337 paddw m5, m5 1338%else 1339 movddup m5, [pw_2] 1340%endif 1341 mov hd, hm 1342 shl hpadd, 2 1343 pxor m4, m4 1344 sub hd, hpadd 1345 cmp dword wm, 8 1346 mov r5, acq 1347 jg .w16 1348 je .w8 1349 lea r3, [strideq*3] 1350.w4_loop: 1351 pmaddwd m0, m5, [ypxq+strideq*0] 1352 pmaddwd m1, m5, [ypxq+strideq*1] 1353 pmaddwd m2, m5, [ypxq+strideq*2] 1354 pmaddwd m3, m5, [ypxq+r3 ] 1355 lea ypxq, [ypxq+strideq*4] 1356 paddd m0, m1 1357 paddd m2, m3 1358 paddd m4, m0 1359 packssdw m0, m2 1360 paddd m4, m2 1361 mova [acq], m0 1362 add acq, 16 1363 sub hd, 2 1364 jg .w4_loop 1365 test hpadd, hpadd 1366 jz .dc 1367 punpckhqdq m0, m0 1368 pslld m2, 2 1369.w4_hpad: 1370 mova [acq+16*0], m0 1371 paddd m4, m2 1372 mova [acq+16*1], m0 1373 add acq, 16*2 1374 sub hpadd, 4 1375 jg .w4_hpad 1376 jmp .dc 1377.w8: 1378%if ARCH_X86_32 1379 cmp dword wpadm, 0 1380%else 1381 test wpadd, wpadd 1382%endif 1383 jnz .w8_wpad1 1384.w8_loop: 1385 pmaddwd m0, m5, [ypxq+strideq*0+16*0] 1386 pmaddwd m2, m5, [ypxq+strideq*1+16*0] 1387 pmaddwd m1, m5, [ypxq+strideq*0+16*1] 1388 pmaddwd m3, m5, [ypxq+strideq*1+16*1] 1389 
lea ypxq, [ypxq+strideq*2] 1390 paddd m0, m2 1391 paddd m1, m3 1392 paddd m2, m0, m1 1393 packssdw m0, m1 1394 paddd m4, m2 1395 mova [acq], m0 1396 add acq, 16 1397 dec hd 1398 jg .w8_loop 1399.w8_hpad: 1400 test hpadd, hpadd 1401 jz .dc 1402 pslld m2, 2 1403 mova m1, m0 1404 jmp .hpad 1405.w8_wpad1: 1406 pmaddwd m0, m5, [ypxq+strideq*0] 1407 pmaddwd m1, m5, [ypxq+strideq*1] 1408 lea ypxq, [ypxq+strideq*2] 1409 paddd m0, m1 1410 pshufd m1, m0, q3333 1411 paddd m2, m0, m1 1412 packssdw m0, m1 1413 paddd m4, m2 1414 mova [acq], m0 1415 add acq, 16 1416 dec hd 1417 jg .w8_wpad1 1418 jmp .w8_hpad 1419.w16_wpad3: 1420 pshufd m3, m0, q3333 1421 mova m1, m3 1422 mova m2, m3 1423 jmp .w16_wpad_end 1424.w16_wpad2: 1425 pshufd m1, m3, q3333 1426 mova m2, m1 1427 jmp .w16_wpad_end 1428.w16_wpad1: 1429 pshufd m2, m1, q3333 1430 jmp .w16_wpad_end 1431.w16: 1432 movifnidn wpadd, wpadm 1433 WIN64_SPILL_XMM 7 1434.w16_loop: 1435 pmaddwd m0, m5, [ypxq+strideq*0+16*0] 1436 pmaddwd m6, m5, [ypxq+strideq*1+16*0] 1437 paddd m0, m6 1438 cmp wpadd, 2 1439 jg .w16_wpad3 1440 pmaddwd m3, m5, [ypxq+strideq*0+16*1] 1441 pmaddwd m6, m5, [ypxq+strideq*1+16*1] 1442 paddd m3, m6 1443 je .w16_wpad2 1444 pmaddwd m1, m5, [ypxq+strideq*0+16*2] 1445 pmaddwd m6, m5, [ypxq+strideq*1+16*2] 1446 paddd m1, m6 1447 jp .w16_wpad1 1448 pmaddwd m2, m5, [ypxq+strideq*0+16*3] 1449 pmaddwd m6, m5, [ypxq+strideq*1+16*3] 1450 paddd m2, m6 1451.w16_wpad_end: 1452 lea ypxq, [ypxq+strideq*2] 1453 paddd m6, m0, m3 1454 packssdw m0, m3 1455 paddd m6, m1 1456 mova [acq+16*0], m0 1457 packssdw m1, m2 1458 paddd m2, m6 1459 mova [acq+16*1], m1 1460 add acq, 16*2 1461 paddd m4, m2 1462 dec hd 1463 jg .w16_loop 1464 WIN64_RESTORE_XMM 1465 add hpadd, hpadd 1466 jz .dc 1467 paddd m2, m2 1468.hpad: 1469 mova [acq+16*0], m0 1470 mova [acq+16*1], m1 1471 paddd m4, m2 1472 mova [acq+16*2], m0 1473 mova [acq+16*3], m1 1474 add acq, 16*4 1475 sub hpadd, 4 1476 jg .hpad 1477.dc: 1478 sub r5, acq ; -w*h*2 1479 pshufd m2, m4, q1032 
; (continuation of the shared .dc tail of ipred_cfl_ac_420_16bpc)
; r5 = -w*h*2 is a power of two, so tzcnt gives log2(w*h)+1; minus 2
; yields a shift of log2(w*h)-1, i.e. sum >> shift = 2*average, which
; pavgw against zero turns into the rounded average ((x+1)>>1).
    tzcnt                 r1d, r5d
    paddd                 m2, m4
    sub                   r1d, 2
    pshufd                m4, m2, q2301
    movd                  m0, r1d             ; m0 = shift amount
    paddd                 m2, m4              ; m2 = total sum in every dword
    psrld                 m2, m0
    pxor                  m0, m0
    pavgw                 m2, m0              ; rounded average
    packssdw              m2, m2              ; broadcast to all words
; Walk the buffer from -w*h*2 up to 0, subtracting the average.
.dc_loop:
    mova                  m0, [acq+r5+16*0]
    mova                  m1, [acq+r5+16*1]
    psubw                 m0, m2
    psubw                 m1, m2
    mova [acq+r5+16*0], m0
    mova [acq+r5+16*1], m1
    add                   r5, 16*2
    jl .dc_loop
    RET

;---------------------------------------------------------------------
; Build the CFL "AC" buffer from 4:2:2 luma.  One luma row maps to one
; AC row; pmaddwd with words of 4 makes each output word 4*(horizontal
; pair sum) = 8x the pair average, matching the 420/444 scaling.  m4
; accumulates the total and r5 the buffer start, as in the 420 case;
; the shared .dc / .hpad tails of the 420 function finish the job
; (reached through the mangled jumps below).
;---------------------------------------------------------------------
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn             hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; Synthesize words of 4 without a data load (no PIC address setup):
    ; ~0 -> abs = 1 -> <<2 = 4.
    pcmpeqw               m5, m5
    pabsw                 m5, m5
    psllw                 m5, 2
%else
    movddup               m5, [pw_4]          ; pmaddwd weights
%endif
    mov                   hd, hm
    shl                   hpadd, 2
    pxor                  m4, m4              ; m4 = running sum of AC words
    sub                   hd, hpadd
    cmp            dword  wm, 8
    mov                   r5, acq             ; r5 = buffer start, for .dc
    jg .w16
    je .w8
    lea                   r3, [strideq*3]
; 4 luma rows -> 4 four-wide AC rows (two 16-byte stores) per iteration.
.w4_loop:
    pmaddwd               m0, m5, [ypxq+strideq*0]
    pmaddwd               m3, m5, [ypxq+strideq*1]
    pmaddwd               m1, m5, [ypxq+strideq*2]
    pmaddwd               m2, m5, [ypxq+r3]
    lea                   ypxq, [ypxq+strideq*4]
    paddd                 m4, m0
    packssdw              m0, m3
    paddd                 m3, m1
    packssdw              m1, m2
    paddd                 m4, m2
    paddd                 m4, m3
    mova      [acq+16*0], m0
    mova      [acq+16*1], m1
    add                   acq, 16*2
    sub                   hd, 4
    jg .w4_loop
    test                  hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    ; Bottom padding: replicate the last AC row (high qword of m1) over
    ; 8 padded rows; m2<<3 = that row's sum x8 for the accumulator.
    punpckhqdq            m1, m1
    pslld                 m2, 3
    mova      [acq+16*0], m1
    mova      [acq+16*1], m1
    paddd                 m4, m2
    mova      [acq+16*2], m1
    mova      [acq+16*3], m1
    add                   acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
%if ARCH_X86_32
    cmp            dword  wpadm, 0
%else
    test                  wpadd, wpadd
%endif
    jnz .w8_wpad1
; 2 luma rows -> 2 eight-wide AC rows per iteration.
.w8_loop:
    pmaddwd               m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd               m2, m5, [ypxq+strideq*0+16*1]
    pmaddwd               m1, m5, [ypxq+strideq*1+16*0]
    pmaddwd               m3, m5, [ypxq+strideq*1+16*1]
    lea                   ypxq, [ypxq+strideq*2]
    paddd                 m4, m0
    packssdw              m0, m2
    paddd                 m4, m2
    mova      [acq+16*0], m0
    paddd                 m2, m1, m3          ; m2 = last row's total (for hpad)
    packssdw              m1, m3
; (continuation of ipred_cfl_ac_422_16bpc's .w8_loop)
    paddd                 m4, m2
    mova      [acq+16*1], m1
    add                   acq, 16*2
    sub                   hd, 2
    jg .w8_loop
.w8_hpad:
    test                  hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                 m2, 2               ; last row's sum x4 = 4 padded rows
    mova                  m0, m1              ; .hpad stores m0/m1 pairs
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
; Width-padded 8-wide rows: replicate the rightmost valid dword
; (q3333) across the padded half of each row.
.w8_wpad1:
    pmaddwd               m0, m5, [ypxq+strideq*0]
    pmaddwd               m1, m5, [ypxq+strideq*1]
    lea                   ypxq, [ypxq+strideq*2]
    pshufd                m2, m0, q3333
    pshufd                m3, m1, q3333
    paddd                 m4, m0
    packssdw              m0, m2
    paddd                 m4, m2
    paddd                 m2, m1, m3
    packssdw              m1, m3
    paddd                 m4, m2
    mova      [acq+16*0], m0
    mova      [acq+16*1], m1
    add                   acq, 16*2
    sub                   hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
; Width padding for 16-wide rows: replicate the last valid dword group.
.w16_wpad3:
    pshufd                m3, m0, q3333
    mova                  m1, m3
    mova                  m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd                m1, m3, q3333
    mova                  m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd                m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn             wpadd, wpadm
    WIN64_SPILL_XMM        7
; 1 luma row -> 1 sixteen-wide AC row per iteration.
.w16_loop:
    pmaddwd               m0, m5, [ypxq+16*0]
    cmp                   wpadd, 2            ; flags reused by je/jp below
    jg .w16_wpad3
    pmaddwd               m3, m5, [ypxq+16*1]
    je .w16_wpad2
    pmaddwd               m1, m5, [ypxq+16*2]
    ; PF still reflects wpad-2 (SSE ops don't touch EFLAGS): even
    ; parity (0xff, wpad==1) -> taken; odd (0xfe, wpad==0) -> full width.
    jp .w16_wpad1
    pmaddwd               m2, m5, [ypxq+16*3]
.w16_wpad_end:
    add                   ypxq, strideq
    paddd                 m6, m0, m3
    packssdw              m0, m3
    mova      [acq+16*0], m0
    paddd                 m6, m1
    packssdw              m1, m2
    paddd                 m2, m6              ; m2 = this row's total
    mova      [acq+16*1], m1
    add                   acq, 16*2
    paddd                 m4, m2
    dec                   hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add                   hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                 m2, m2              ; last row's sum x4 (2 stores here)
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad

;---------------------------------------------------------------------
; Build the CFL "AC" buffer from 4:4:4 luma.  Pixels are stored
; directly, scaled by 8 (psllw 3) to match the 420/422 scaling; m5 =
; words of 1 so pmaddwd produces pair sums for the m4 accumulator.
; Dispatches by width through its own jump table, then finishes in the
; shared .dc / .hpad tails of the 420 function.
;---------------------------------------------------------------------
cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
    LEA                   r6, ipred_cfl_ac_444_16bpc_ssse3_table
    tzcnt                 wd, wm
    movifnidn             hpadd, hpadm
    pxor                  m4, m4              ; m4 = running sum of AC words
    movsxd                wq, [r6+wq*4]
    movddup               m5, [base+pw_1]     ; pmaddwd weights (pair sums)
    add                   wq, r6
    mov                   hd, hm
    shl                   hpadd, 2
    sub                   hd, hpadd
    jmp                   wq
.w4:
    lea                   r3, [strideq*3]
    mov                   r5, acq             ; r5 = buffer start, for .dc
; 4 rows (2 per register) per iteration.
.w4_loop:
    movq                  m0, [ypxq+strideq*0]
    movhps                m0, [ypxq+strideq*1]
    movq                  m1, [ypxq+strideq*2]
    movhps                m1, [ypxq+r3]
    lea                   ypxq, [ypxq+strideq*4]
    psllw                 m0, 3
    psllw                 m1, 3
    mova      [acq+16*0], m0
    pmaddwd               m0, m5
    mova      [acq+16*1], m1
    pmaddwd               m2, m5, m1
    add                   acq, 16*2
    paddd                 m4, m0
    paddd                 m4, m2
    sub                   hd, 4
    jg .w4_loop
    test                  hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    ; Bottom padding: replicate the last AC row (high qword of m1) over
    ; 8 padded rows; m2's high qword x4 gives that row's sum x8.
    punpckhqdq            m1, m1
    mova      [acq+16*0], m1
    pslld                 m2, 2
    mova      [acq+16*1], m1
    punpckhqdq            m2, m2
    mova      [acq+16*2], m1
    paddd                 m4, m2
    mova      [acq+16*3], m1
    add                   acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
    mov                   r5, acq
; 2 rows per iteration.
.w8_loop:
    mova                  m0, [ypxq+strideq*0]
    mova                  m1, [ypxq+strideq*1]
    lea                   ypxq, [ypxq+strideq*2]
    psllw                 m0, 3
    psllw                 m1, 3
    mova      [acq+16*0], m0
    pmaddwd               m0, m5
    mova      [acq+16*1], m1
    pmaddwd               m2, m5, m1          ; m2 = last row's pair sums
    add                   acq, 16*2
    paddd                 m4, m0
    paddd                 m4, m2
    sub                   hd, 2
    jg .w8_loop
.w8_hpad:
    test                  hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                 m2, 2               ; last row's sum x4 = 4 padded rows
    mova                  m0, m1              ; .hpad stores m0/m1 pairs
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
; Width padding for 16-wide rows: replicate the last valid pixel
; (high word, via pshufhw+punpckhqdq) across the padded half.
.w16_wpad2:
    pshufhw               m3, m2, q3333
    pshufhw               m1, m0, q3333
    punpckhqdq            m3, m3
    punpckhqdq            m1, m1
    jmp .w16_wpad_end
.w16:
    movifnidn             wpadd, wpadm
    mov                   r5, acq
; 2 rows of 16 pixels per iteration.
.w16_loop:
    mova                  m2, [ypxq+strideq*0+16*0]
    mova                  m0, [ypxq+strideq*1+16*0]
    psllw                 m2, 3
    psllw                 m0, 3
    test                  wpadd, wpadd
    jnz .w16_wpad2
    mova                  m3, [ypxq+strideq*0+16*1]
    mova                  m1, [ypxq+strideq*1+16*1]
    psllw                 m3, 3
    psllw                 m1, 3
.w16_wpad_end:
    lea                   ypxq, [ypxq+strideq*2]
    mova      [acq+16*0], m2
    pmaddwd               m2, m5
    mova      [acq+16*1], m3
    pmaddwd               m3, m5
    paddd                 m4, m2
    pmaddwd               m2, m5, m0
    mova      [acq+16*2], m0
    paddd                 m4, m3
    pmaddwd               m3, m5, m1
    mova      [acq+16*3], m1
    add                   acq, 16*4
    paddd                 m2, m3              ; m2 = last row's total (for hpad)
    paddd                 m4, m2
    sub                   hd, 2
    jg .w16_loop
    add                   hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                 m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
; Width padding for 32-wide rows: replicate the last valid pixel of
; the rightmost loaded register over the padded registers.
.w32_wpad6:
    pshufhw               m1, m0, q3333
    punpckhqdq            m1, m1
    mova                  m2, m1
    mova                  m3, m1
    jmp .w32_wpad_end
.w32_wpad4:
    pshufhw               m2, m1, q3333
    punpckhqdq            m2, m2
    mova                  m3, m2
    jmp .w32_wpad_end
.w32_wpad2:
    pshufhw               m3, m2, q3333
    punpckhqdq            m3, m3
    jmp .w32_wpad_end
.w32:
    movifnidn             wpadd, wpadm
    mov                   r5, acq
    WIN64_SPILL_XMM        8
; 1 row of 32 pixels per iteration; m6 collects the row total.
.w32_loop:
    mova                  m0, [ypxq+16*0]
    psllw                 m0, 3
    cmp                   wpadd, 4            ; flags reused by je/jnp below
    jg .w32_wpad6
    mova                  m1, [ypxq+16*1]
    psllw                 m1, 3
    je .w32_wpad4
    mova                  m2, [ypxq+16*2]
    psllw                 m2, 3
    ; PF still reflects wpad-4 (SSE ops don't touch EFLAGS): odd
    ; parity (0xfe, wpad==2) -> taken; wpad==0 falls through.
    jnp .w32_wpad2
    mova                  m3, [ypxq+16*3]
    psllw                 m3, 3
.w32_wpad_end:
    add                   ypxq, strideq
    pmaddwd               m6, m5, m0
    mova      [acq+16*0], m0
    pmaddwd               m7, m5, m1
    mova      [acq+16*1], m1
    paddd                 m6, m7
    pmaddwd               m7, m5, m2
    mova      [acq+16*2], m2
    paddd                 m6, m7
    pmaddwd               m7, m5, m3
    mova      [acq+16*3], m3
    add                   acq, 16*4
    paddd                 m6, m7              ; m6 = this row's total
    paddd                 m4, m6
    dec                   hd
    jg .w32_loop
%if WIN64
    ; WIN64_RESTORE_XMM reloads the callee-saved xmm6/7, which would
    ; destroy the row total the hpad loop still needs: copy it into the
    ; volatile m5 first, then SWAP the register names so "m6" below
    ; refers to the preserved value.
    mova                  m5, m6
    WIN64_RESTORE_XMM
    SWAP                   5, 6
%endif
    test                  hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
; Bottom padding: repeat the last row (m0-m3) once per padded row.
.w32_hpad_loop:
    mova      [acq+16*0], m0
    mova      [acq+16*1], m1
    paddd                 m4, m6
    mova      [acq+16*2], m2
    mova      [acq+16*3], m3
    add                   acq, 16*4
    dec                   hpadd
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc

;---------------------------------------------------------------------
; Palette prediction, 16bpc.  The 8-entry 16-bit palette is split via
; pal_pred_shuf into a low-byte plane (m3) and a high-byte plane (m4,
; the qword-swapped copy); each index byte then selects from both
; planes with pshufb, and punpcklbw/punpckhbw re-interleave the byte
; pairs into 16-bit pixels.  Dispatches on width via the jump table.
; idx is consumed 16 bytes at a time (one index byte per pixel —
; presumably pre-packed by the caller; verify against the C wrapper).
;---------------------------------------------------------------------
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
    ; Only 5 GPRs are available; reuse r2 (table base) as the height
    ; counter once the dispatch address has been computed.
    %define hd r2d
%endif
    mova                  m3, [palq]
    LEA                   r2, pal_pred_16bpc_ssse3_table
    tzcnt                 wd, wm
    pshufb                m3, [base+pal_pred_shuf]  ; m3 = [lo bytes | hi bytes]
    movsxd                wq, [r2+wq*4]
    pshufd                m4, m3, q1032             ; m4 = [hi bytes | lo bytes]
    add                   wq, r2
    movifnidn             hd, hm
    jmp                   wq
; 16 index bytes -> 4 rows of 4 pixels.
.w4:
    mova                  m0, [idxq]
    add                   idxq, 16
    pshufb                m1, m3, m0          ; low bytes of selected entries
    pshufb                m2, m4, m0          ; high bytes of selected entries
    punpcklbw             m0, m1, m2          ; interleave into 16-bit pixels
    punpckhbw             m1, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea                   dstq, [dstq+strideq*2]
    sub                   hd, 4
    jg .w4
    RET
; 16 index bytes -> 2 rows of 8 pixels.
.w8:
    mova                  m0, [idxq]
    add                   idxq, 16
    pshufb                m1, m3, m0
    pshufb                m2, m4, m0
    punpcklbw             m0, m1, m2
    punpckhbw             m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea                   dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w8
    RET
; 16 index bytes -> 1 row of 16 pixels.
.w16:
    mova                  m0, [idxq]
    add                   idxq, 16
    pshufb                m1, m3, m0
    pshufb                m2, m4, m0
    punpcklbw             m0, m1, m2
    punpckhbw             m1, m2
    mova      [dstq+16*0], m0
    mova      [dstq+16*1], m1
    add                   dstq, strideq
    dec                   hd
    jg .w16
    RET
; 32 index bytes -> 1 row of 32 pixels.
.w32:
    mova                  m0, [idxq+16*0]
    pshufb                m1, m3, m0
    pshufb                m2, m4, m0
    punpcklbw             m0, m1, m2
    punpckhbw             m1, m2
    mova                  m2, [idxq+16*1]
    add                   idxq, 16*2
    mova      [dstq+16*0], m0
    pshufb                m0, m3, m2
    mova      [dstq+16*1], m1
    pshufb                m1, m4, m2
    punpcklbw             m2, m0, m1
    punpckhbw             m0, m1
    mova      [dstq+16*2], m2
    mova      [dstq+16*3], m0
    add                   dstq, strideq
    dec                   hd
    jg .w32
    RET
; 64 index bytes -> 1 row of 64 pixels.
.w64:
    mova                  m0, [idxq+16*0]
    pshufb                m1, m3, m0
    pshufb                m2, m4, m0
    punpcklbw             m0, m1, m2
    punpckhbw             m1, m2
    mova                  m2, [idxq+16*1]
    mova      [dstq+16*0], m0
    pshufb                m0, m3, m2
    mova      [dstq+16*1], m1
    pshufb                m1, m4, m2
    punpcklbw             m2, m0, m1
    punpckhbw             m0, m1
    mova                  m1, [idxq+16*2]
    mova      [dstq+16*2], m2
    pshufb                m2, m3, m1
    mova      [dstq+16*3], m0
    pshufb                m0, m4, m1
    punpcklbw             m1, m2, m0
    punpckhbw             m2, m0
    mova                  m0, [idxq+16*3]
    add                   idxq, 16*4
    mova      [dstq+16*4], m1
    pshufb                m1, m3, m0
    mova      [dstq+16*5], m2
    pshufb                m2, m4, m0
    punpcklbw             m0, m1, m2
    punpckhbw             m1, m2
    mova      [dstq+16*6], m0
    mova      [dstq+16*7], m1
    add                   dstq, strideq
    dec                   hd
    jg .w64
    RET