1; Copyright © 2018-2021, VideoLAN and dav1d authors 2; Copyright © 2018, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; Emit one signed byte pair (w - 128, 127 - w) per weight so that a single
; pmaddubsw (signed * unsigned multiply-accumulate) can compute
; w*a + (256-w)*b from packed pixel pairs.
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; pshufb control vectors and small constants used by the kernels below.
ipred_v_shuf     : db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
ipred_h_shuf     : db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
ipred_paeth_shuf : db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
filter_shuf1     : db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
filter_shuf2     : db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1

pw_8        : times 8  dw 8
pb_3        : times 16 db 3
pb_128      : times 8  db 128
pw_128      : times 4  dw 128
pw_255      : times 4  dw 255
pb_2        : times 8  db 2
pb_4        : times 8  db 4
pb_127_m127 : times 4  db 127, -127
pd_32768    : times 1  dd 32768

; Build a jump table of dword offsets (relative to table base - 2*4) to the
; .%3, .%4, ... entry points of function %1 with suffix %2.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The splat (.sN) entries live at a fixed offset inside the dc/cfl tables,
; so other functions (dc_128, v) can reuse them via these aliases.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)

JMP_TABLE ipred_h,        ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,       ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left,  ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,    ssse3, w4, w8, w16, w32, w64
JMP_TABLE pal_pred,       ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl,      ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter,   ssse3, w4, w8, w16, w32

cextern filter_intra_taps

SECTION .text

;---------------------------------------------------------------------------------------
;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Store one broadcast row (m0 lane selected by %3) across a width-%1 row at
; offset %2 from dstq.
%macro IPRED_SET 3                              ; width, stride, pshuflw_imm8
    pshuflw              m1, m0, %3             ; extend 8 byte for 2 pos
    punpcklqdq           m1, m1
    mova    [dstq +      %2], m1
%if %1 > 16
    mova    [dstq + 16 + %2], m1
%endif
%if %1 > 32
    mova    [dstq + 32 + %2], m1
    mova    [dstq + 48 + %2], m1
%endif
%endmacro

; Emit 4 rows of horizontal prediction for width %1: each row is the left
; neighbour pixel replicated across the row.
%macro IPRED_H 1                                ; width
    sub                 tlq, 4
    movd                 m0, [tlq]              ; get 4 bytes of topleft data
    punpcklbw            m0, m0                 ; extend 2 byte
%if %1 == 4
    pshuflw              m1, m0, q2233
    movd   [dstq+strideq*0], m1
    psrlq                m1, 32
    movd   [dstq+strideq*1], m1
    pshuflw              m0, m0, q0011
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+stride3q ], m0
%elif %1 == 8
    punpcklwd            m0, m0
    punpckhdq            m1, m0, m0
    punpckldq            m0, m0
    movq   [dstq+strideq*1], m1
    movhps [dstq+strideq*0], m1
    movq   [dstq+stride3q ], m0
    movhps [dstq+strideq*2], m0
%else
    IPRED_SET            %1, 0,         q3333
    IPRED_SET            %1, strideq,   q2222
    IPRED_SET            %1, strideq*2, q1111
    IPRED_SET            %1, stride3q,  q0000
%endif
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w%1
    RET
%endmacro

INIT_XMM ssse3
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_h_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    IPRED_H 4
.w8:
    IPRED_H 8
.w16:
    IPRED_H 16
.w32:
    IPRED_H 32
.w64:
    IPRED_H 64

;---------------------------------------------------------------------------------------
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                    const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Loads up to 64 top pixels into m0-m3 and tail-jumps into the dc splat
; stores (.s4/.s8/...), which replicate those registers down every row.
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movu                 m0, [tlq+ 1]
    movu                 m1, [tlq+17]
    movu                 m2, [tlq+33]
    movu                 m3, [tlq+49]
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                     const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Two-stage dispatch: first jump (.hN) sums the left column, second jump (.wN)
; adds the top row, rounds/divides, and falls into the .sN splat store loop.
; Sums are accumulated as negated words via pmaddubsw with m3 = all -1.
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    movifnidn            wd, wm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                 m4, r5d
    tzcnt               r5d, r5d
    movd                 m5, r5d
    LEA                  r5, ipred_dc_ssse3_table
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]
    movsxd               wq, [r5+wq*4+20]
    pcmpeqd              m3, m3
    psrlw                m4, 1                  ; dc = (width + height) >> 1;
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
.h4:
    movd                 m0, [tlq-4]
    pmaddubsw            m0, m3
    jmp                  wq
.w4:
    movd                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m0, m4
    paddw                m0, m1
    pmaddwd              m0, m3
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3                  ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:
    ; Non-square block: divide by w+h via fixed-point multiply
    ; (0x5556 ~ 1/3, 0x3334 ~ 1/5 after the preceding shifts).
    punpckhqdq           m1, m0, m0
    paddw                m0, m1
    psrlq                m1, m0, 32
    paddw                m0, m1
    psrlw                m0, 2
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 8
    cmovz               r6d, r2d
    movd                 m5, r6d
    pmulhuw              m0, m5
.w4_end:
    pxor                 m1, m1
    pshufb               m0, m1                 ; broadcast dc byte to all lanes
.s4:
    movd   [dstq+strideq*0], m0
    movd   [dstq+strideq*1], m0
    movd   [dstq+strideq*2], m0
    movd   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    movq                 m0, [tlq-8]
    pmaddubsw            m0, m3
    jmp                  wq
.w8:
    movq                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    paddw                m0, m1
    pshuflw              m1, m0, q1032          ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w8_end:
    pxor                 m1, m1
    pshufb               m0, m1
.s8:
    movq   [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m0
    movq   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-16]
    pmaddubsw            m0, m3
    jmp                  wq
.w16:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032          ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w16_end:
    pxor                 m1, m1
    pshufb               m0, m1
.s16:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-32]
    pmaddubsw            m0, m3
    mova                 m2, [tlq-16]
    pmaddubsw            m2, m3
    paddw                m0, m2
    jmp                  wq
.w32:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    movu                 m2, [tlq+17]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032          ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 32
    je .w32_end
    ; (removed dead `lea r2d, [hq*2]` here: r2d was unconditionally
    ;  overwritten by the `mov r2d, 0x3334` below before any read)
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 64|16
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w32_end:
    pxor                 m1, m1
    pshufb               m0, m1
    mova                 m1, m0
.s32:
    mova             [dstq], m0
    mova          [dstq+16], m1
    mova     [dstq+strideq], m0
    mova  [dstq+strideq+16], m1
    mova   [dstq+strideq*2], m0
    mova [dstq+strideq*2+16], m1
    mova    [dstq+stride3q], m0
    mova [dstq+stride3q+16], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                 m0, [tlq-64]
    mova                 m1, [tlq-48]
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    paddw                m0, m1
    mova                 m1, [tlq-32]
    pmaddubsw            m1, m3
    paddw                m0, m1
    mova                 m1, [tlq-16]
    pmaddubsw            m1, m3
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+ 1]
    movu                 m2, [tlq+17]
    pmaddubsw            m1, m3
    pmaddubsw            m2, m3
    paddw                m1, m2
    movu                 m2, [tlq+33]
    pmaddubsw            m2, m3
    paddw                m1, m2
    movu                 m2, [tlq+49]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032          ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 64
    je .w64_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w64_end:
    pxor                 m1, m1
    pshufb               m0, m1
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
.s64:
    mova             [dstq], m0
    mova          [dstq+16], m1
    mova          [dstq+32], m2
    mova          [dstq+48], m3
    mova     [dstq+strideq], m0
    mova  [dstq+strideq+16], m1
    mova  [dstq+strideq+32], m2
    mova  [dstq+strideq+48], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s64
    RET

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                          const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; dc from the left column only: sum h left pixels, multiply by 32768>>log2(h)
; with pmulhrsw (rounding average), then reuse the dc splat stores.
; Also the shared tail for ipred_dc_top (which enters at .h64/.h32/etc with
; tlq pointing at the top row, hence the "unaligned" notes).
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_left_ssse3_table
    mov                  hd, hm                 ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq
    tzcnt                wd, wm
    movu                 m0, [tlq]
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, r6d
    psrld                m3, m2
    movsxd               r6, [r5+r6*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
.h64:
    movu                 m1, [tlq+48]           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
    movu                 m1, [tlq+32]           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h32:
    movu                 m1, [tlq+16]           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232          ; psrlq m1, m0, 16
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032          ; psrlq m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2
    pmulhrsw             m0, m3
    lea            stride3q, [strideq*3]
    pxor                 m1, m1
    pshufb               m0, m1
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                         const int width, const int height, const int a);
490;--------------------------------------------------------------------------------------- 491cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 492 LEA r5, ipred_dc_splat_ssse3_table 493 tzcnt wd, wm 494 movifnidn hd, hm 495 movsxd wq, [r5+wq*4] 496 movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] 497 mova m1, m0 498 mova m2, m0 499 mova m3, m0 500 add wq, r5 501 lea stride3q, [strideq*3] 502 jmp wq 503 504;--------------------------------------------------------------------------------------- 505;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 506; const int width, const int height, const int a); 507;--------------------------------------------------------------------------------------- 508cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h 509 LEA r5, ipred_dc_left_ssse3_table 510 tzcnt wd, wm 511 inc tlq 512 movu m0, [tlq] 513 movifnidn hd, hm 514 movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] 515 movd m2, wd 516 psrld m3, m2 517 movsxd r6, [r5+wq*4] 518 pcmpeqd m2, m2 519 pmaddubsw m0, m2 520 add r6, r5 521 add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table 522 movsxd wq, [r5+wq*4] 523 add wq, r5 524 jmp r6 525 526;--------------------------------------------------------------------------------------- 527;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 528; const int width, const int height, const int a); 529;--------------------------------------------------------------------------------------- 530%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] 531 ; w * a = (w - 128) * a + 128 * a 532 ; (256 - w) * b = (127 - w) * b + 129 * b 533 ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] 534 pmaddubsw m6, m%3, m%1 535 pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b 536 paddw m6, m%5 537 paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] 538 psrlw m6, 8 539 psrlw m0, 8 540 
packuswb m6, m0 541%endmacro 542 543cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights 544%define base r6-ipred_smooth_v_ssse3_table 545 LEA r6, ipred_smooth_v_ssse3_table 546 tzcnt wd, wm 547 mov hd, hm 548 movsxd wq, [r6+wq*4] 549 movddup m0, [base+pb_127_m127] 550 movddup m1, [base+pw_128] 551 lea weightsq, [base+smooth_weights+hq*4] 552 neg hq 553 movd m5, [tlq+hq] 554 pxor m2, m2 555 pshufb m5, m2 556 add wq, r6 557 jmp wq 558.w4: 559 movd m2, [tlq+1] 560 punpckldq m2, m2 561 punpcklbw m2, m5 ; top, bottom 562 lea r3, [strideq*3] 563 mova m4, [base+ipred_v_shuf] 564 mova m5, m4 565 punpckldq m4, m4 566 punpckhdq m5, m5 567 pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom 568 paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok 569 paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 570.w4_loop: 571 movu m1, [weightsq+hq*2] 572 pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop 573 pshufb m1, m5 574 SMOOTH 0, 1, 2, 2, 3, 3 575 movd [dstq+strideq*0], m6 576 pshuflw m1, m6, q1032 577 movd [dstq+strideq*1], m1 578 punpckhqdq m6, m6 579 movd [dstq+strideq*2], m6 580 psrlq m6, 32 581 movd [dstq+r3 ], m6 582 lea dstq, [dstq+strideq*4] 583 add hq, 4 584 jl .w4_loop 585 RET 586ALIGN function_align 587.w8: 588 movq m2, [tlq+1] 589 punpcklbw m2, m5 590 mova m5, [base+ipred_v_shuf] 591 lea r3, [strideq*3] 592 pshufd m4, m5, q0000 593 pshufd m5, m5, q1111 594 pmaddubsw m3, m2, m0 595 paddw m1, m2 596 paddw m3, m1 ; m3 is output for loop 597.w8_loop: 598 movq m1, [weightsq+hq*2] 599 pshufb m0, m1, m4 600 pshufb m1, m5 601 SMOOTH 0, 1, 2, 2, 3, 3 602 movq [dstq+strideq*0], m6 603 movhps [dstq+strideq*1], m6 604 lea dstq, [dstq+strideq*2] 605 add hq, 2 606 jl .w8_loop 607 RET 608ALIGN function_align 609.w16: 610 movu m3, [tlq+1] 611 punpcklbw m2, m3, m5 612 punpckhbw m3, m5 613 pmaddubsw m4, m2, m0 614 pmaddubsw m5, m3, m0 615 paddw m0, m1, m2 616 paddw m1, m3 617 paddw m4, m0 618 paddw m5, m1 ; m4 and m5 is output for loop 
619.w16_loop: 620 movd m1, [weightsq+hq*2] 621 pshuflw m1, m1, q0000 622 punpcklqdq m1, m1 623 SMOOTH 1, 1, 2, 3, 4, 5 624 mova [dstq], m6 625 add dstq, strideq 626 add hq, 1 627 jl .w16_loop 628 RET 629ALIGN function_align 630.w32: 631%if WIN64 632 movaps [rsp+24], xmm7 633 %define xmm_regs_used 8 634%endif 635 mova m7, m5 636.w32_loop_init: 637 mov r3d, 2 638.w32_loop: 639 movddup m0, [base+pb_127_m127] 640 movddup m1, [base+pw_128] 641 movu m3, [tlq+1] 642 punpcklbw m2, m3, m7 643 punpckhbw m3, m7 644 pmaddubsw m4, m2, m0 645 pmaddubsw m5, m3, m0 646 paddw m0, m1, m2 647 paddw m1, m3 648 paddw m4, m0 649 paddw m5, m1 650 movd m1, [weightsq+hq*2] 651 pshuflw m1, m1, q0000 652 punpcklqdq m1, m1 653 SMOOTH 1, 1, 2, 3, 4, 5 654 mova [dstq], m6 655 add tlq, 16 656 add dstq, 16 657 dec r3d 658 jg .w32_loop 659 lea dstq, [dstq-32+strideq] 660 sub tlq, 32 661 add hq, 1 662 jl .w32_loop_init 663 RET 664ALIGN function_align 665.w64: 666%if WIN64 667 movaps [rsp+24], xmm7 668 %define xmm_regs_used 8 669%endif 670 mova m7, m5 671.w64_loop_init: 672 mov r3d, 4 673.w64_loop: 674 movddup m0, [base+pb_127_m127] 675 movddup m1, [base+pw_128] 676 movu m3, [tlq+1] 677 punpcklbw m2, m3, m7 678 punpckhbw m3, m7 679 pmaddubsw m4, m2, m0 680 pmaddubsw m5, m3, m0 681 paddw m0, m1, m2 682 paddw m1, m3 683 paddw m4, m0 684 paddw m5, m1 685 movd m1, [weightsq+hq*2] 686 pshuflw m1, m1, q0000 687 punpcklqdq m1, m1 688 SMOOTH 1, 1, 2, 3, 4, 5 689 mova [dstq], m6 690 add tlq, 16 691 add dstq, 16 692 dec r3d 693 jg .w64_loop 694 lea dstq, [dstq-64+strideq] 695 sub tlq, 64 696 add hq, 1 697 jl .w64_loop_init 698 RET 699 700;--------------------------------------------------------------------------------------- 701;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 702; const int width, const int height, const int a); 703;--------------------------------------------------------------------------------------- 704cglobal ipred_smooth_h_8bpc, 3, 7, 8, 
dst, stride, tl, w, h 705%define base r6-ipred_smooth_h_ssse3_table 706 LEA r6, ipred_smooth_h_ssse3_table 707 mov wd, wm 708 movd m3, [tlq+wq] 709 pxor m1, m1 710 pshufb m3, m1 ; right 711 tzcnt wd, wd 712 mov hd, hm 713 movsxd wq, [r6+wq*4] 714 movddup m4, [base+pb_127_m127] 715 movddup m5, [base+pw_128] 716 add wq, r6 717 jmp wq 718.w4: 719 movddup m6, [base+smooth_weights+4*2] 720 mova m7, [base+ipred_h_shuf] 721 sub tlq, 4 722 sub tlq, hq 723 lea r3, [strideq*3] 724.w4_loop: 725 movd m2, [tlq+hq] ; left 726 pshufb m2, m7 727 punpcklbw m1, m2, m3 ; left, right 728 punpckhbw m2, m3 729 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 730 paddw m0, m1 ; 128 * left + 129 * right 731 pmaddubsw m1, m6 732 paddw m1, m5 733 paddw m0, m1 734 pmaddubsw m1, m2, m4 735 paddw m1, m2 736 pmaddubsw m2, m6 737 paddw m2, m5 738 paddw m1, m2 739 psrlw m0, 8 740 psrlw m1, 8 741 packuswb m0, m1 742 movd [dstq+strideq*0], m0 743 pshuflw m1, m0, q1032 744 movd [dstq+strideq*1], m1 745 punpckhqdq m0, m0 746 movd [dstq+strideq*2], m0 747 psrlq m0, 32 748 movd [dstq+r3 ], m0 749 lea dstq, [dstq+strideq*4] 750 sub hd, 4 751 jg .w4_loop 752 RET 753ALIGN function_align 754.w8: 755 mova m6, [base+smooth_weights+8*2] 756 mova m7, [base+ipred_h_shuf] 757 sub tlq, 4 758 sub tlq, hq 759 punpckldq m7, m7 760.w8_loop: 761 movd m2, [tlq+hq] ; left 762 pshufb m2, m7 763 punpcklbw m1, m2, m3 ; left, right 764 punpckhbw m2, m3 765 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 766 paddw m0, m1 ; 128 * left + 129 * right 767 pmaddubsw m1, m6 768 paddw m1, m5 769 paddw m0, m1 770 pmaddubsw m1, m2, m4 771 paddw m1, m2 772 pmaddubsw m2, m6 773 paddw m2, m5 774 paddw m1, m2 775 psrlw m0, 8 776 psrlw m1, 8 777 packuswb m0, m1 778 movq [dstq+strideq*0], m0 779 movhps [dstq+strideq*1], m0 780 lea dstq, [dstq+strideq*2] 781 sub hd, 2 782 jg .w8_loop 783 RET 784ALIGN function_align 785.w16: 786 mova m6, [base+smooth_weights+16*2] 787 mova m7, [base+smooth_weights+16*3] 788 sub tlq, 1 789 sub tlq, hq 
790.w16_loop: 791 pxor m1, m1 792 movd m2, [tlq+hq] ; left 793 pshufb m2, m1 794 punpcklbw m1, m2, m3 ; left, right 795 punpckhbw m2, m3 796 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 797 paddw m0, m1 ; 128 * left + 129 * right 798 pmaddubsw m1, m6 799 paddw m1, m5 800 paddw m0, m1 801 pmaddubsw m1, m2, m4 802 paddw m1, m2 803 pmaddubsw m2, m7 804 paddw m2, m5 805 paddw m1, m2 806 psrlw m0, 8 807 psrlw m1, 8 808 packuswb m0, m1 809 mova [dstq], m0 810 lea dstq, [dstq+strideq] 811 sub hd, 1 812 jg .w16_loop 813 RET 814ALIGN function_align 815.w32: 816 sub tlq, 1 817 sub tlq, hq 818 pxor m6, m6 819.w32_loop_init: 820 mov r5, 2 821 lea r3, [base+smooth_weights+16*4] 822.w32_loop: 823 mova m7, [r3] 824 add r3, 16 825 movd m2, [tlq+hq] ; left 826 pshufb m2, m6 827 punpcklbw m1, m2, m3 ; left, right 828 punpckhbw m2, m3 829 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 830 paddw m0, m1 ; 128 * left + 129 * right 831 pmaddubsw m1, m7 832 paddw m1, m5 833 paddw m0, m1 834 pmaddubsw m1, m2, m4 835 paddw m1, m2 836 mova m7, [r3] 837 add r3, 16 838 pmaddubsw m2, m7 839 paddw m2, m5 840 paddw m1, m2 841 psrlw m0, 8 842 psrlw m1, 8 843 packuswb m0, m1 844 mova [dstq], m0 845 add dstq, 16 846 dec r5 847 jg .w32_loop 848 lea dstq, [dstq-32+strideq] 849 sub hd, 1 850 jg .w32_loop_init 851 RET 852ALIGN function_align 853.w64: 854 sub tlq, 1 855 sub tlq, hq 856 pxor m6, m6 857.w64_loop_init: 858 mov r5, 4 859 lea r3, [base+smooth_weights+16*8] 860.w64_loop: 861 mova m7, [r3] 862 add r3, 16 863 movd m2, [tlq+hq] ; left 864 pshufb m2, m6 865 punpcklbw m1, m2, m3 ; left, right 866 punpckhbw m2, m3 867 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 868 paddw m0, m1 ; 128 * left + 129 * right 869 pmaddubsw m1, m7 870 paddw m1, m5 871 paddw m0, m1 872 pmaddubsw m1, m2, m4 873 paddw m1, m2 874 mova m7, [r3] 875 add r3, 16 876 pmaddubsw m2, m7 877 paddw m2, m5 878 paddw m1, m2 879 psrlw m0, 8 880 psrlw m1, 8 881 packuswb m0, m1 882 mova [dstq], m0 883 add dstq, 16 884 dec r5 885 jg 
.w64_loop 886 lea dstq, [dstq-64+strideq] 887 sub hd, 1 888 jg .w64_loop_init 889 RET 890 891;--------------------------------------------------------------------------------------- 892;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, 893; const int width, const int height, const int a); 894;--------------------------------------------------------------------------------------- 895%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 896 pmaddubsw m6, m%3, m%1 897 mova m0, m6 898 pmaddubsw m6, m%4, m%2 899 mova m1, m6 900%ifnum %5 901 paddw m0, m%5 902%else 903 paddw m0, %5 904%endif 905%ifnum %6 906 paddw m1, m%6 907%else 908 paddw m1, %6 909%endif 910%ifnum %7 911%else 912 mova m3, %7 913%endif 914 pavgw m0, m2 915 pavgw m1, m3 916 psrlw m0, 8 917 psrlw m1, 8 918 packuswb m0, m1 919%endmacro 920 921%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] 922 mova m1, [rsp+16*%1] ; top 923 punpckhbw m6, m1, m0 ; top, bottom 924 punpcklbw m1, m0 ; top, bottom 925 pmaddubsw m2, m1, m5 926 mova [rsp+16*%2], m1 927 paddw m1, m3 ; 1 * top + 255 * bottom + 255 928 paddw m2, m1 ; 128 * top + 129 * bottom + 255 929 mova [rsp+16*%3], m2 930 pmaddubsw m2, m6, m5 931 mova [rsp+16*%4], m6 932 paddw m6, m3 ; 1 * top + 255 * bottom + 255 933 paddw m2, m6 ; 128 * top + 129 * bottom + 255 934 mova [rsp+16*%5], m2 935 movd m1, [tlq+hq] ; left 936 pshufb m1, [base+pb_3] ; topleft[-(1 + y)] 937 punpcklbw m1, m4 ; left, right 938 pmaddubsw m2, m1, m5 ; 127 * left - 127 * right 939 paddw m2, m1 ; 128 * left + 129 * right 940 mova m3, m2 941 pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; 942 pmaddubsw m1, %7 943 paddw m2, m3, m0 944 paddw m3, m1 945 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; 946 mova m7, [rsp+16*%9] 947 pshufb m1, m7 948 mova [rsp+16*%8], m3 949 mova m4, [rsp+16*%2] 950 mova m5, [rsp+16*%3] 951 mova m3, [rsp+16*%4] 952 mova m7, [rsp+16*%5] 
953 SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] 954 mova [dstq], m0 955 movddup m3, [base+pw_255] ; recovery 956 mova m0, [rsp+16*%10] ; recovery 957 mova m4, [rsp+16*%11] ; recovery 958 mova m5, [rsp+16*%12] ; recovery 959%endmacro 960 961cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights 962%define base r6-ipred_smooth_ssse3_table 963 mov wd, wm 964 mov hd, hm 965 LEA r6, ipred_smooth_ssse3_table 966 movd m4, [tlq+wq] ; right 967 pxor m2, m2 968 pshufb m4, m2 969 tzcnt wd, wd 970 mov r5, tlq 971 sub r5, hq 972 movsxd wq, [r6+wq*4] 973 movddup m5, [base+pb_127_m127] 974 movd m0, [r5] 975 pshufb m0, m2 ; bottom 976 movddup m3, [base+pw_255] 977 add wq, r6 978 lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] 979 jmp wq 980.w4: 981 mova m7, [base+ipred_v_shuf] 982 movd m1, [tlq+1] ; left 983 pshufd m1, m1, q0000 984 sub tlq, 4 985 lea r3, [strideq*3] 986 sub tlq, hq 987 punpcklbw m1, m0 ; top, bottom 988 pshufd m6, m7, q1100 989 pshufd m7, m7, q3322 990 pmaddubsw m2, m1, m5 991 paddw m3, m1 ; 1 * top + 255 * bottom + 255 992 paddw m2, m3 ; 128 * top + 129 * bottom + 255 993 mova [rsp+16*0], m1 994 mova [rsp+16*1], m2 995 movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; 996 punpcklqdq m1, m1 997 mova [rsp+16*2], m1 998 mova [rsp+16*3], m4 999 mova [rsp+16*4], m6 1000 mova [rsp+16*5], m5 1001.w4_loop: 1002 movd m1, [tlq+hq] ; left 1003 pshufb m1, [base+ipred_h_shuf] 1004 punpcklbw m0, m1, m4 ; left, right 1005 punpckhbw m1, m4 1006 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right 1007 pmaddubsw m3, m1, m5 1008 paddw m2, m0 ; 128 * left + 129 * right 1009 paddw m3, m1 1010 mova m4, [rsp+16*2] 1011 pmaddubsw m0, m4 1012 pmaddubsw m1, m4 1013 paddw m2, m0 1014 paddw m3, m1 1015 movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; 1016 add v_weightsq, 8 1017 pshufb m0, m1, m6 1018 pshufb m1, m7 1019 mova m4, [rsp+16*0] 1020 mova m5, [rsp+16*1] 1021 SMOOTH_2D_END 0, 1, 4, 4, 
5, 5, 3 1022 mova m4, [rsp+16*3] 1023 mova m6, [rsp+16*4] 1024 mova m5, [rsp+16*5] 1025 movd [dstq+strideq*0], m0 1026 pshuflw m1, m0, q1032 1027 movd [dstq+strideq*1], m1 1028 punpckhqdq m0, m0 1029 movd [dstq+strideq*2], m0 1030 psrlq m0, 32 1031 movd [dstq+r3 ], m0 1032 lea dstq, [dstq+strideq*4] 1033 sub hd, 4 1034 jg .w4_loop 1035 RET 1036ALIGN function_align 1037.w8: 1038 mova m7, [base+ipred_v_shuf] 1039 movq m1, [tlq+1] ; left 1040 punpcklqdq m1, m1 1041 sub tlq, 4 1042 sub tlq, hq 1043 punpcklbw m1, m0 1044 pshufd m6, m7, q0000 1045 pshufd m7, m7, q1111 1046 pmaddubsw m2, m1, m5 1047 paddw m3, m1 1048 paddw m2, m3 1049 mova [rsp+16*0], m1 1050 mova [rsp+16*1], m2 1051 mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; 1052 mova [rsp+16*2], m1 1053 mova [rsp+16*3], m4 1054 mova [rsp+16*4], m6 1055 mova [rsp+16*5], m5 1056.w8_loop: 1057 movd m1, [tlq+hq] ; left 1058 pshufb m1, [base+ipred_h_shuf] 1059 pshufd m1, m1, q1100 1060 punpcklbw m0, m1, m4 1061 punpckhbw m1, m4 1062 pmaddubsw m2, m0, m5 1063 pmaddubsw m3, m1, m5 1064 paddw m2, m0 1065 paddw m3, m1 1066 mova m4, [rsp+16*2] 1067 pmaddubsw m0, m4 1068 pmaddubsw m1, m4 1069 paddw m2, m0 1070 paddw m3, m1 1071 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; 1072 add v_weightsq, 4 1073 pshufb m0, m1, m6 1074 pshufb m1, m7 1075 mova m4, [rsp+16*0] 1076 mova m5, [rsp+16*1] 1077 SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 1078 mova m4, [rsp+16*3] 1079 mova m6, [rsp+16*4] 1080 mova m5, [rsp+16*5] 1081 movq [dstq+strideq*0], m0 1082 movhps [dstq+strideq*1], m0 1083 lea dstq, [dstq+strideq*2] 1084 sub hd, 2 1085 jg .w8_loop 1086 RET 1087ALIGN function_align 1088.w16: 1089 mova m7, [base+ipred_v_shuf] 1090 movu m1, [tlq+1] ; left 1091 sub tlq, 4 1092 sub tlq, hq 1093 punpckhbw m6, m1, m0 ; top, bottom 1094 punpcklbw m1, m0 ; top, bottom 1095 pshufd m7, m7, q0000 1096 mova [rsp+16*2], m7 1097 pmaddubsw m2, m6, m5 1098 mova [rsp+16*5], m6 1099 paddw m6, m3 ; 1 * top + 255 * bottom + 
255 1100 paddw m2, m6 ; 128 * top + 129 * bottom + 255 1101 mova [rsp+16*6], m2 1102 pmaddubsw m2, m1, m5 1103 paddw m3, m1 ; 1 * top + 255 * bottom + 255 1104 mova [rsp+16*0], m1 1105 paddw m2, m3 ; 128 * top + 129 * bottom + 255 1106 mova [rsp+16*1], m2 1107 mova [rsp+16*3], m4 1108 mova [rsp+16*4], m5 1109.w16_loop: 1110 movd m1, [tlq+hq] ; left 1111 pshufb m1, [base+pb_3] ; topleft[-(1 + y)] 1112 punpcklbw m1, m4 ; left, right 1113 pmaddubsw m2, m1, m5 ; 127 * left - 127 * right 1114 paddw m2, m1 ; 128 * left + 129 * right 1115 mova m0, m1 1116 mova m3, m2 1117 pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; 1118 pmaddubsw m1, [base+smooth_weights+16*3] 1119 paddw m2, m0 1120 paddw m3, m1 1121 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; 1122 add v_weightsq, 2 1123 mova m7, [rsp+16*2] 1124 pshufb m1, m7 1125 mova [rsp+16*7], m3 1126 mova m4, [rsp+16*0] 1127 mova m5, [rsp+16*1] 1128 mova m3, [rsp+16*5] 1129 mova m7, [rsp+16*6] 1130 SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] 1131 mova m4, [rsp+16*3] 1132 mova m5, [rsp+16*4] 1133 mova [dstq], m0 1134 lea dstq, [dstq+strideq] 1135 sub hd, 1 1136 jg .w16_loop 1137 RET 1138ALIGN function_align 1139.w32: 1140 movu m1, [tlq+1] ; top topleft[1 + x] 1141 movu m2, [tlq+17] ; top 1142 mova [rsp+16*0], m1 1143 mova [rsp+16*1], m2 1144 sub tlq, 4 1145 sub tlq, hq 1146 mova m7, [base+ipred_v_shuf] 1147 pshufd m7, m7, q0000 1148 mova [rsp+16*2], m7 1149 mova [rsp+16*3], m0 1150 mova [rsp+16*4], m4 1151 mova [rsp+16*5], m5 1152.w32_loop: 1153 SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 1154 add dstq, 16 1155 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 1156 lea dstq, [dstq-16+strideq] 1157 add v_weightsq, 2 1158 sub hd, 1 1159 jg .w32_loop 1160 RET 1161ALIGN function_align 1162.w64: 1163 movu m1, [tlq+1] ; top topleft[1 + x] 1164 movu m2, [tlq+17] ; top 
    ; --- continuation of ipred_smooth_8bpc .w64 (function entry is above this
    ; chunk): spill the four 16-byte top slices and the loop invariants to the
    ; stack, then produce 64 output pixels per row in four 16-byte steps.
    mova   [rsp+16*0], m1
    mova   [rsp+16*1], m2
    movu            m1, [tlq+33]                 ; top, bytes 32..47
    movu            m2, [tlq+49]                 ; top, bytes 48..63
    mova  [rsp+16*11], m1
    mova  [rsp+16*12], m2
    sub            tlq, 4
    sub            tlq, hq                       ; point tlq below the left edge
    mova            m7, [base+ipred_v_shuf]
    pshufd          m7, m7, q0000                ; broadcast shuffle for the vertical weights
    mova   [rsp+16*2], m7
    mova   [rsp+16*3], m0
    mova   [rsp+16*4], m4
    mova   [rsp+16*5], m5
.w64_loop:
    ; four 16-pixel slices per row, each with its own pair of horizontal
    ; weight vectors from smooth_weights[]
    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
    add           dstq, 16
    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
    add           dstq, 16
    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
    add           dstq, 16
    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
    lea           dstq, [dstq-48+strideq]        ; undo the three +16 steps, advance one row
    add     v_weightsq, 2
    sub             hd, 1
    jg .w64_loop
    RET

;---------------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
;                         const uint8_t *idx, const int w, const int h);
;---------------------------------------------------------------------------------------
; Palette prediction: dst[x] = pal[idx[x]]. The 8-entry palette is packed to
; bytes in m4 and each output pixel is produced with a single pshufb lookup,
; using the idx bytes as shuffle control. Dispatch on width via jump table.
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    mova            m4, [palq]                   ; 8 palette entries (words)
    LEA             r2, pal_pred_ssse3_table
    tzcnt           wd, wm
    movifnidn       hd, hm
    movsxd          wq, [r2+wq*4]
    packuswb        m4, m4                       ; pack palette to bytes (duplicated in both halves)
    add             wq, r2
    lea             r2, [strideq*3]              ; r2 = stride*3 for 4-row stores
    jmp             wq
.w4:
    ; 16 idx bytes -> 4 rows of 4 pixels
    pshufb          m0, m4, [idxq]
    add           idxq, 16
    movd   [dstq      ], m0
    pshuflw         m1, m0, q1032
    movd   [dstq+strideq ], m1
    punpckhqdq      m0, m0
    movd   [dstq+strideq*2], m0
    psrlq           m0, 32
    movd   [dstq+r2   ], m0
    lea           dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    ; 32 idx bytes -> 4 rows of 8 pixels
    pshufb          m0, m4, [idxq]
    pshufb          m1, m4, [idxq+16]
    add           idxq, 32
    movq   [dstq      ], m0
    movhps [dstq+strideq ], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r2   ], m1
    lea           dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    ; 64 idx bytes -> 4 rows of 16 pixels
    pshufb          m0, m4, [idxq]
    pshufb          m1, m4, [idxq+16]
    pshufb          m2, m4, [idxq+32]
    pshufb          m3, m4, [idxq+48]
    add           idxq, 64
    mova   [dstq      ], m0
    mova   [dstq+strideq ], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+r2   ], m3
    lea           dstq, [dstq+strideq*4]
    sub             hd, 4
    jg .w16
    RET
ALIGN function_align
.w32:
    ; 64 idx bytes -> 2 rows of 32 pixels
    pshufb          m0, m4, [idxq]
    pshufb          m1, m4, [idxq+16]
    pshufb          m2, m4, [idxq+32]
    pshufb          m3, m4, [idxq+48]
    add           idxq, 64
    mova   [dstq      ], m0
    mova   [dstq+16   ], m1
    mova   [dstq+strideq ], m2
    mova   [dstq+strideq+16], m3
    lea           dstq, [dstq+strideq*2]
    sub             hd, 2
    jg .w32
    RET
ALIGN function_align
.w64:
    ; 64 idx bytes -> 1 row of 64 pixels
    pshufb          m0, m4, [idxq]
    pshufb          m1, m4, [idxq+16]
    pshufb          m2, m4, [idxq+32]
    pshufb          m3, m4, [idxq+48]
    add           idxq, 64
    mova       [dstq ], m0
    mova    [dstq+16], m1
    mova    [dstq+32], m2
    mova    [dstq+48], m3
    add           dstq, strideq
    sub             hd, 1
    jg .w64
    RET

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; Scale one register of ac coefficients by alpha and add the dc value:
;   m%1 = m0 + sign(ac*alpha) * round(|ac| * |alpha| / 64)
; Expects: m0 = dc splat, m1 = alpha splat (words), m2 = |alpha| << 9.
; pmulhrsw is a signed rounded multiply, so the sign is stripped first
; (pabsw) and reapplied afterwards (psignw). Clobbers m3.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw          m3, m%1, m1
    pabsw          m%1, m%1
    pmulhrsw       m%1, m2
    psignw         m%1, m3
    paddw          m%1, m0
%endmacro

; temporary GPR: r7 exists on UNIX64; on Win64/x86-32 reuse r5
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

; dc prediction + CfL: compute dc from both edges, then splat/scale the ac
; buffer. r6 dispatches on height to the edge-summing code, wq on width to
; the .s* splat/store code.
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn       wd, wm
    movifnidn       hd, hm
    tzcnt          r6d, hd
    lea            t0d, [wq+hq]                  ; t0d = w + h (number of edge pixels)
    movd            m4, t0d                      ; m4: rounding term, halved below
    tzcnt          t0d, t0d
movd m5, t0d 1306 LEA t0, ipred_cfl_ssse3_table 1307 tzcnt wd, wd 1308 movsxd r6, [t0+r6*4] 1309 movsxd wq, [t0+wq*4+16] 1310 pcmpeqd m3, m3 1311 psrlw m4, 1 1312 add r6, t0 1313 add wq, t0 1314 movifnidn acq, acmp 1315 jmp r6 1316.h4: 1317 movd m0, [tlq-4] 1318 pmaddubsw m0, m3 1319 jmp wq 1320.w4: 1321 movd m1, [tlq+1] 1322 pmaddubsw m1, m3 1323 psubw m0, m4 1324 paddw m0, m1 1325 pmaddwd m0, m3 1326 cmp hd, 4 1327 jg .w4_mul 1328 psrlw m0, 3 ; dc >>= ctz(width + height); 1329 jmp .w4_end 1330.w4_mul: 1331 punpckhqdq m1, m0, m0 1332 paddw m0, m1 1333 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 1334 paddw m0, m1 1335 psrlw m0, 2 1336 mov r6d, 0x5556 1337 mov r2d, 0x3334 1338 test hd, 8 1339 cmovz r6d, r2d 1340 movd m5, r6d 1341 pmulhuw m0, m5 1342.w4_end: 1343 pshuflw m0, m0, q0000 1344 punpcklqdq m0, m0 1345.s4: 1346 movd m1, alpham 1347 pshuflw m1, m1, q0000 1348 punpcklqdq m1, m1 1349 lea r6, [strideq*3] 1350 pabsw m2, m1 1351 psllw m2, 9 1352.s4_loop: 1353 mova m4, [acq] 1354 mova m5, [acq+16] 1355 IPRED_CFL 4 1356 IPRED_CFL 5 1357 packuswb m4, m5 1358 movd [dstq+strideq*0], m4 1359 pshuflw m4, m4, q1032 1360 movd [dstq+strideq*1], m4 1361 punpckhqdq m4, m4 1362 movd [dstq+strideq*2], m4 1363 psrlq m4, 32 1364 movd [dstq+r6 ], m4 1365 lea dstq, [dstq+strideq*4] 1366 add acq, 32 1367 sub hd, 4 1368 jg .s4_loop 1369 RET 1370ALIGN function_align 1371.h8: 1372 movq m0, [tlq-8] 1373 pmaddubsw m0, m3 1374 jmp wq 1375.w8: 1376 movq m1, [tlq+1] 1377 pmaddubsw m1, m3 1378 psubw m4, m0 1379 punpckhqdq m0, m0 1380 psubw m0, m4 1381 paddw m0, m1 1382 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 1383 paddw m0, m1 1384 pmaddwd m0, m3 1385 psrlw m0, m5 1386 cmp hd, 8 1387 je .w8_end 1388 mov r6d, 0x5556 1389 mov r2d, 0x3334 1390 cmp hd, 32 1391 cmovz r6d, r2d 1392 movd m1, r6d 1393 pmulhuw m0, m1 1394.w8_end: 1395 pshuflw m0, m0, q0000 1396 punpcklqdq m0, m0 1397.s8: 1398 movd m1, alpham 1399 pshuflw m1, m1, q0000 1400 punpcklqdq m1, m1 1401 lea r6, [strideq*3] 1402 pabsw m2, m1 
1403 psllw m2, 9 1404.s8_loop: 1405 mova m4, [acq] 1406 mova m5, [acq+16] 1407 IPRED_CFL 4 1408 IPRED_CFL 5 1409 packuswb m4, m5 1410 movq [dstq ], m4 1411 movhps [dstq+strideq ], m4 1412 mova m4, [acq+32] 1413 mova m5, [acq+48] 1414 IPRED_CFL 4 1415 IPRED_CFL 5 1416 packuswb m4, m5 1417 movq [dstq+strideq*2], m4 1418 movhps [dstq+r6 ], m4 1419 lea dstq, [dstq+strideq*4] 1420 add acq, 64 1421 sub hd, 4 1422 jg .s8_loop 1423 RET 1424ALIGN function_align 1425.h16: 1426 mova m0, [tlq-16] 1427 pmaddubsw m0, m3 1428 jmp wq 1429.w16: 1430 movu m1, [tlq+1] 1431 pmaddubsw m1, m3 1432 paddw m0, m1 1433 psubw m4, m0 1434 punpckhqdq m0, m0 1435 psubw m0, m4 1436 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 1437 paddw m0, m1 1438 pmaddwd m0, m3 1439 psrlw m0, m5 1440 cmp hd, 16 1441 je .w16_end 1442 mov r6d, 0x5556 1443 mov r2d, 0x3334 1444 test hd, 8|32 1445 cmovz r6d, r2d 1446 movd m1, r6d 1447 pmulhuw m0, m1 1448.w16_end: 1449 pshuflw m0, m0, q0000 1450 punpcklqdq m0, m0 1451.s16: 1452 movd m1, alpham 1453 pshuflw m1, m1, q0000 1454 punpcklqdq m1, m1 1455 pabsw m2, m1 1456 psllw m2, 9 1457.s16_loop: 1458 mova m4, [acq] 1459 mova m5, [acq+16] 1460 IPRED_CFL 4 1461 IPRED_CFL 5 1462 packuswb m4, m5 1463 mova [dstq], m4 1464 mova m4, [acq+32] 1465 mova m5, [acq+48] 1466 IPRED_CFL 4 1467 IPRED_CFL 5 1468 packuswb m4, m5 1469 mova [dstq+strideq], m4 1470 lea dstq, [dstq+strideq*2] 1471 add acq, 64 1472 sub hd, 2 1473 jg .s16_loop 1474 RET 1475ALIGN function_align 1476.h32: 1477 mova m0, [tlq-32] 1478 pmaddubsw m0, m3 1479 mova m2, [tlq-16] 1480 pmaddubsw m2, m3 1481 paddw m0, m2 1482 jmp wq 1483.w32: 1484 movu m1, [tlq+1] 1485 pmaddubsw m1, m3 1486 movu m2, [tlq+17] 1487 pmaddubsw m2, m3 1488 paddw m1, m2 1489 paddw m0, m1 1490 psubw m4, m0 1491 punpckhqdq m0, m0 1492 psubw m0, m4 1493 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 1494 paddw m0, m1 1495 pmaddwd m0, m3 1496 psrlw m0, m5 1497 cmp hd, 32 1498 je .w32_end 1499 lea r2d, [hq*2] 1500 mov r6d, 0x5556 1501 mov r2d, 0x3334 1502 
    ; --- tail of ipred_cfl_8bpc .w32 dc computation: (w+h) is not a power of
    ; two here, so finish the divide with a fixed-point reciprocal multiply;
    ; 0x5556 ~ 1/3 and 0x3334 ~ 1/5 in .16 fixed point (choice depends on h).
    test            hd, 64|16
    cmovz          r6d, r2d
    movd            m1, r6d
    pmulhuw         m0, m1
.w32_end:
    pshuflw         m0, m0, q0000                ; splat dc to all words
    punpcklqdq      m0, m0
.s32:
    ; store path shared with cfl_left/top/128: m0 = dc splat, alpha from arg
    movd            m1, alpham
    pshuflw         m1, m1, q0000
    punpcklqdq      m1, m1
    pabsw           m2, m1
    psllw           m2, 9                        ; m2 = |alpha| << 9 for IPRED_CFL's pmulhrsw
.s32_loop:
    mova            m4, [acq]
    mova            m5, [acq+16]
    IPRED_CFL        4
    IPRED_CFL        5
    packuswb        m4, m5
    mova        [dstq], m4
    mova            m4, [acq+32]
    mova            m5, [acq+48]
    IPRED_CFL        4
    IPRED_CFL        5
    packuswb        m4, m5
    mova     [dstq+16], m4
    add           dstq, strideq
    add            acq, 64
    dec             hd
    jg .s32_loop
    RET

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; dc from the left edge only. pmaddubsw against all-ones (m2 = -1 bytes) sums
; pixel pairs (negated); m3 = 0x8000 >> log2(h) then turns those sums into a
; rounded average via pmulhrsw in .h4. r6 dispatches on height, wq jumps into
; the ipred_cfl splat/store code (.s4/.s8/.s16/.s32).
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    mov             hd, hm                       ; zero upper half
    tzcnt          r6d, hd
    sub            tlq, hq                       ; tlq -> first left-edge pixel
    tzcnt           wd, wm
    movu            m0, [tlq]
    mov            t0d, 0x8000
    movd            m3, t0d
    movd            m2, r6d
    psrld           m3, m2                       ; m3 = 0x8000 >> log2(h)
    LEA             t0, ipred_cfl_left_ssse3_table
    movsxd          r6, [t0+r6*4]
    pcmpeqd         m2, m2                       ; m2 = all ones (-1 bytes)
    pmaddubsw       m0, m2                       ; negated pairwise sums of edge pixels
    add             r6, t0
    add             t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd          wq, [t0+wq*4]
    add             wq, t0
    movifnidn      acq, acmp
    jmp             r6
.h32:
    movu            m1, [tlq+16]                 ; unaligned when jumping here from dc_top
    pmaddubsw       m1, m2
    paddw           m0, m1
.h16:
    pshufd          m1, m0, q3232                ; psrlq m1, m0, 16
    paddw           m0, m1
.h8:
    pshuflw         m1, m0, q1032                ; psrlq m1, m0, 32
    paddw           m0, m1
.h4:
    pmaddwd         m0, m2                       ; fold word sums to one dword
    pmulhrsw        m0, m3                       ; rounded average (sign cancels the negation)
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0                       ; m0 = dc splat
    jmp             wq

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                               const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; dc from the top edge only: same scheme as cfl_left but averaging w pixels
; starting at topleft[1]; reuses cfl_left's .h* code via the same jump table
; (hence indexing the table with the width).
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    LEA             t0, ipred_cfl_left_ssse3_table
    tzcnt           wd, wm
    inc            tlq                           ; tlq -> first top-edge pixel
    movu            m0, [tlq]
    movifnidn       hd, hm
    mov            r6d, 0x8000
    movd            m3, r6d
    movd            m2, wd
    psrld           m3, m2                       ; m3 = 0x8000 >> log2(w)
    movsxd          r6, [t0+wq*4]
    pcmpeqd         m2, m2
    pmaddubsw       m0, m2
    add             r6, t0
    add             t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd          wq, [t0+wq*4]
    add             wq, t0
    movifnidn      acq, acmp
    jmp             r6

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                               const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; dc fixed at 128 (no edge pixels available); jump straight to the splat code.
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    tzcnt           wd, wm
    movifnidn       hd, hm
    LEA             r6, ipred_cfl_splat_ssse3_table
    movsxd          wq, [r6+wq*4]
    movddup         m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] ; m0 = 128 splat
    add             wq, r6
    movifnidn      acq, acmp
    jmp             wq

; restore the ac pointer that the main loops advanced (backup is a register
; on x86-64, a stack slot on x86-32)
%macro RELOAD_ACQ_32 1
    mov            acq, ac_bakq                  ; restore acq
%endmacro

; cfl_ac 4:2:0: 2x2-downsample the luma plane into the ac buffer
; (each output = 2 * sum of a 2x2 block), replicating the last column/row
; into the wpad/hpad region, then subtract the rounded average from every
; entry so the buffer is zero-mean.
%if ARCH_X86_64
cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
    movddup         m2, [pb_2]                   ; pmaddubsw multiplier: 2*(p0+p1)
%else
cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
    mov            t0d, 0x02020202               ; no rodata-relative load on x86-32; build pb_2 inline
    movd            m2, t0d
    pshufd          m2, m2, q0000
%endif
    movifnidn       wd, wm
    mov            t0d, hm
    mov             hd, t0d
    imul           t0d, wd                       ; t0d = w*h = total ac entry count
    movd            m5, t0d
    movifnidn    hpadd, hpadm
%if ARCH_X86_64
    mov        ac_bakq, acq
%endif
    shl          hpadd, 2                        ; hpad in output rows
    sub             hd, hpadd                    ; rows actually read from the luma plane
    pxor            m4, m4                       ; m4 = running sum for the average
    cmp             wd, 8
    jg .w16
    je .w8
    ; fall-through
%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
    lea       stride3q, [strideq*3]
.w4_loop:
    ; two output rows per iteration: 8 luma pixels per row, summed 2x2
    movq            m0, [yq]
    movq            m1, [yq+strideq]
    movhps          m0, [yq+strideq*2]
    movhps          m1, [yq+stride3q]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1                       ; 2*(2x2 sums)
    mova         [acq], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*4]
    add            acq, 16
    sub             hd, 2
    jg .w4_loop
    test         hpadd, hpadd
    jz .calc_avg_4_8
    punpckhqdq      m0, m0                       ; replicate last output row for hpad
.w4_hpad_loop:
    mova         [acq], m0
    paddw           m4, m0
    add            acq, 16
    sub          hpadd, 2
    jg .w4_hpad_loop
    jmp .calc_avg_4_8
.w8:
    lea       stride3q, [strideq*3]
    test         wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    mova            m0, [yq]
    mova            m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova         [acq], m0
    paddw           m4, m0
    mova            m0, [yq+strideq*2]
    mova            m1, [yq+stride3q]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova      [acq+16], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*4]
    add            acq, 32
    sub             hd, 2
    jg .w8_loop
    test         hpadd, hpadd
    jz .calc_avg_4_8
    jmp .w8_hpad
.w8_wpad: ; wpadd=1
    ; only the left 4 outputs are real; replicate entry 3 into 4..7
    movddup         m0, [yq]
    movddup         m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    pshufhw         m0, m0, q3333
    mova         [acq], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*2]
    add            acq, 16
    sub             hd, 1
    jg .w8_wpad
    test         hpadd, hpadd
    jz .calc_avg_4_8
.w8_hpad:
    mova         [acq], m0                       ; replicate last row downwards
    paddw           m4, m0
    add            acq, 16
    sub          hpadd, 1
    jg .w8_hpad
    jmp .calc_avg_4_8
.w16:
    test         wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    mova            m0, [yq]
    mova            m1, [yq+strideq]
    pmaddubsw       m0, m2
pmaddubsw m1, m2 1732 paddw m0, m1 1733 mova [acq], m0 1734 paddw m4, m0 1735 mova m6, [yq+16] 1736 mova m1, [yq+strideq+16] 1737 pmaddubsw m6, m2 1738 pmaddubsw m1, m2 1739 paddw m6, m1 1740 mova [acq+16], m6 1741 paddw m4, m6 1742 lea yq, [yq+strideq*2] 1743 add acq, 32 1744 dec hd 1745 jg .w16_loop 1746 test hpadd, hpadd 1747 jz .calc_avg16 1748 jmp .w16_hpad_loop 1749.w16_wpad: 1750 cmp wpadd, 2 1751 jl .w16_pad1 1752 je .w16_pad2 1753.w16_pad3: 1754 movddup m0, [yq] 1755 movddup m1, [yq+strideq] 1756 pmaddubsw m0, m2 1757 pmaddubsw m1, m2 1758 paddw m0, m1 1759 pshufhw m0, m0, q3333 1760 mova [acq], m0 1761 paddw m4, m0 1762 mova m6, m0 1763 punpckhqdq m6, m0, m0 1764 mova [acq+16], m6 1765 paddw m4, m6 1766 lea yq, [yq+strideq*2] 1767 add acq, 32 1768 dec hd 1769 jg .w16_pad3 1770 jmp .w16_wpad_done 1771.w16_pad2: 1772 mova m0, [yq] 1773 mova m1, [yq+strideq] 1774 pmaddubsw m0, m2 1775 pmaddubsw m1, m2 1776 paddw m0, m1 1777 mova [acq], m0 1778 paddw m4, m0 1779 pshufhw m6, m0, q3333 1780 punpckhqdq m6, m6 1781 mova [acq+16], m6 1782 paddw m4, m6 1783 lea yq, [yq+strideq*2] 1784 add acq, 32 1785 dec hd 1786 jg .w16_pad2 1787 jmp .w16_wpad_done 1788.w16_pad1: 1789 mova m0, [yq] 1790 mova m1, [yq+strideq] 1791 pmaddubsw m0, m2 1792 pmaddubsw m1, m2 1793 paddw m0, m1 1794 mova [acq], m0 1795 paddw m4, m0 1796 movddup m6, [yq+16] 1797 movddup m1, [yq+strideq+16] 1798 pmaddubsw m6, m2 1799 pmaddubsw m1, m2 1800 paddw m6, m1 1801 pshufhw m6, m6, q3333 1802 mova [acq+16], m6 1803 paddw m4, m6 1804 lea yq, [yq+strideq*2] 1805 add acq, 32 1806 dec hd 1807 jg .w16_pad1 1808.w16_wpad_done: 1809 test hpadd, hpadd 1810 jz .calc_avg16 1811.w16_hpad_loop: 1812 mova [acq], m0 1813 paddw m4, m0 1814 mova [acq+16], m6 1815 paddw m4, m6 1816 add acq, 32 1817 dec hpadd 1818 jg .w16_hpad_loop 1819 jmp .calc_avg16 1820 1821%if ARCH_X86_64 1822 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 1823%else 1824 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 1825%endif 1826.calc_avg_4_8: 
1827 psrlw m2, 9 1828 pmaddwd m4, m2 1829 jmp .calc_avg 1830.calc_avg16: 1831 psrld m0, m4, 16 1832 pslld m4, 16 1833 psrld m4, 16 1834 paddd m4, m0 1835.calc_avg: 1836 movd szd, m5 1837 psrad m5, 1 1838 tzcnt r1d, szd 1839 paddd m4, m5 1840 movd m1, r1d 1841 pshufd m0, m4, q2301 1842 paddd m0, m4 1843 pshufd m4, m0, q1032 1844 paddd m0, m4 1845 psrad m0, m1 ; sum >>= log2sz; 1846 packssdw m0, m0 1847 RELOAD_ACQ_32 acq 1848.sub_loop: 1849 mova m1, [acq] 1850 psubw m1, m0 ; ac[x] -= sum; 1851 mova [acq], m1 1852 add acq, 16 1853 sub szd, 8 1854 jg .sub_loop 1855 RET 1856 1857%if ARCH_X86_64 1858cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak 1859 movddup m2, [pb_4] 1860%else 1861cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h 1862 mov t0d, 0x04040404 1863 movd m2, t0d 1864 pshufd m2, m2, q0000 1865%endif 1866 movifnidn wd, wm 1867 mov t0d, hm 1868 mov hd, t0d 1869 imul t0d, wd 1870 movd m6, t0d 1871 movifnidn hpadd, hpadm 1872%if ARCH_X86_64 1873 mov ac_bakq, acq 1874%endif 1875 shl hpadd, 2 1876 sub hd, hpadd 1877 pxor m4, m4 1878 pxor m5, m5 1879 cmp wd, 8 1880 jg .w16 1881 je .w8 1882 ; fall-through 1883 1884%if ARCH_X86_64 1885 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 1886%else 1887 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 1888%endif 1889.w4: 1890 lea stride3q, [strideq*3] 1891.w4_loop: 1892 movq m1, [yq] 1893 movhps m1, [yq+strideq] 1894 movq m0, [yq+strideq*2] 1895 movhps m0, [yq+stride3q] 1896 pmaddubsw m0, m2 1897 pmaddubsw m1, m2 1898 mova [acq], m1 1899 mova [acq+16], m0 1900 paddw m4, m0 1901 paddw m5, m1 1902 lea yq, [yq+strideq*4] 1903 add acq, 32 1904 sub hd, 4 1905 jg .w4_loop 1906 test hpadd, hpadd 1907 jz .calc_avg_4 1908 punpckhqdq m0, m0 1909.w4_hpad_loop: 1910 mova [acq], m0 1911 paddw m4, m0 1912 add acq, 16 1913 sub hpadd, 2 1914 jg .w4_hpad_loop 1915 jmp .calc_avg_4 1916.w8: 1917 lea stride3q, [strideq*3] 1918 test wpadd, wpadd 1919 jnz .w8_wpad 1920.w8_loop: 
1921 mova m1, [yq] 1922 mova m0, [yq+strideq] 1923 pmaddubsw m0, m2 1924 pmaddubsw m1, m2 1925 mova [acq], m1 1926 mova [acq+16], m0 1927 paddw m4, m0 1928 paddw m5, m1 1929 mova m1, [yq+strideq*2] 1930 mova m0, [yq+stride3q] 1931 pmaddubsw m0, m2 1932 pmaddubsw m1, m2 1933 mova [acq+32], m1 1934 mova [acq+48], m0 1935 paddw m4, m0 1936 paddw m5, m1 1937 lea yq, [yq+strideq*4] 1938 add acq, 64 1939 sub hd, 4 1940 jg .w8_loop 1941 test hpadd, hpadd 1942 jz .calc_avg_8_16 1943 jmp .w8_hpad 1944.w8_wpad: 1945 movddup m1, [yq] 1946 pmaddubsw m1, m2 1947 pshufhw m1, m1, q3333 1948 mova [acq], m1 1949 paddw m5, m1 1950 movddup m0, [yq+strideq] 1951 pmaddubsw m0, m2 1952 pshufhw m0, m0, q3333 1953 mova [acq+16], m0 1954 paddw m4, m0 1955 lea yq, [yq+strideq*2] 1956 add acq, 32 1957 sub hd, 2 1958 jg .w8_wpad 1959 test hpadd, hpadd 1960 jz .calc_avg_8_16 1961.w8_hpad: 1962 mova [acq], m0 1963 paddw m4, m0 1964 mova [acq+16], m0 1965 paddw m4, m0 1966 add acq, 32 1967 sub hpadd, 2 1968 jg .w8_hpad 1969 jmp .calc_avg_8_16 1970.w16: 1971 test wpadd, wpadd 1972 jnz .w16_wpad 1973.w16_loop: 1974 mova m1, [yq] 1975 mova m0, [yq+16] 1976 pmaddubsw m0, m2 1977 pmaddubsw m1, m2 1978 mova [acq], m1 1979 mova [acq+16], m0 1980 paddw m5, m0 1981 paddw m5, m1 1982 mova m1, [yq+strideq] 1983 mova m0, [yq+strideq+16] 1984 pmaddubsw m0, m2 1985 pmaddubsw m1, m2 1986 mova [acq+32], m1 1987 mova [acq+48], m0 1988 paddw m4, m0 1989 paddw m4, m1 1990 lea yq, [yq+strideq*2] 1991 add acq, 64 1992 sub hd, 2 1993 jg .w16_loop 1994 test hpadd, hpadd 1995 jz .calc_avg_8_16 1996 jmp .w16_hpad_loop 1997.w16_wpad: 1998 cmp wpadd, 2 1999 jl .w16_pad1 2000 je .w16_pad2 2001.w16_pad3: 2002 movddup m1, [yq] 2003 pmaddubsw m1, m2 2004 pshufhw m1, m1, q3333 2005 mova [acq], m1 2006 paddw m5, m1 2007 punpckhqdq m1, m1 2008 mova [acq+16], m1 2009 paddw m5, m1 2010 movddup m1, [yq+strideq] 2011 pmaddubsw m1, m2 2012 pshufhw m1, m1, q3333 2013 mova [acq+32], m1 2014 paddw m4, m1 2015 punpckhqdq m0, m1, m1 2016 
mova [acq+48], m0 2017 paddw m4, m0 2018 lea yq, [yq+strideq*2] 2019 add acq, 64 2020 sub hd, 2 2021 jg .w16_pad3 2022 jmp .w16_wpad_done 2023.w16_pad2: 2024 mova m1, [yq] 2025 pmaddubsw m1, m2 2026 mova [acq], m1 2027 paddw m5, m1 2028 pshufhw m1, m1, q3333 2029 punpckhqdq m1, m1 2030 mova [acq+16], m1 2031 paddw m5, m1 2032 mova m1, [yq+strideq] 2033 pmaddubsw m1, m2 2034 mova [acq+32], m1 2035 paddw m4, m1 2036 mova m0, m1 2037 pshufhw m0, m0, q3333 2038 punpckhqdq m0, m0 2039 mova [acq+48], m0 2040 paddw m4, m0 2041 lea yq, [yq+strideq*2] 2042 add acq, 64 2043 sub hd, 2 2044 jg .w16_pad2 2045 jmp .w16_wpad_done 2046.w16_pad1: 2047 mova m1, [yq] 2048 pmaddubsw m1, m2 2049 mova [acq], m1 2050 paddw m5, m1 2051 movddup m0, [yq+16] 2052 pmaddubsw m0, m2 2053 pshufhw m0, m0, q3333 2054 mova [acq+16], m0 2055 paddw m5, m0 2056 mova m1, [yq+strideq] 2057 pmaddubsw m1, m2 2058 mova [acq+32], m1 2059 paddw m4, m1 2060 movddup m0, [yq+strideq+16] 2061 pmaddubsw m0, m2 2062 pshufhw m0, m0, q3333 2063 mova [acq+48], m0 2064 paddw m4, m0 2065 lea yq, [yq+strideq*2] 2066 add acq, 64 2067 sub hd, 2 2068 jg .w16_pad1 2069.w16_wpad_done: 2070 test hpadd, hpadd 2071 jz .calc_avg_8_16 2072.w16_hpad_loop: 2073 mova [acq], m1 2074 mova [acq+16], m0 2075 paddw m4, m1 2076 paddw m5, m0 2077 mova [acq+32], m1 2078 mova [acq+48], m0 2079 paddw m4, m1 2080 paddw m5, m0 2081 add acq, 64 2082 sub hpadd, 2 2083 jg .w16_hpad_loop 2084 jmp .calc_avg_8_16 2085 2086%if ARCH_X86_64 2087 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 2088%else 2089 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 2090%endif 2091.calc_avg_4: 2092 psrlw m2, 10 2093 pmaddwd m5, m2 2094 pmaddwd m0, m4, m2 2095 jmp .calc_avg 2096.calc_avg_8_16: 2097 mova m0, m5 2098 psrld m5, 16 2099 pslld m0, 16 2100 psrld m0, 16 2101 paddd m5, m0 2102 mova m0, m4 2103 psrld m0, 16 2104 pslld m4, 16 2105 psrld m4, 16 2106 paddd m0, m4 2107.calc_avg: 2108 paddd m5, m0 2109 movd szd, m6 2110 psrad m6, 1 2111 tzcnt r1d, szd ; const int 
log2sz = ctz(width) + ctz(height); 2112 paddd m5, m6 2113 movd m1, r1d 2114 pshufd m0, m5, q2301 2115 paddd m0, m5 2116 pshufd m5, m0, q1032 2117 paddd m0, m5 2118 psrad m0, m1 ; sum >>= log2sz; 2119 packssdw m0, m0 2120 RELOAD_ACQ_32 acq ; ac = ac_orig 2121.sub_loop: 2122 mova m1, [acq] 2123 psubw m1, m0 2124 mova [acq], m1 2125 add acq, 16 2126 sub szd, 8 2127 jg .sub_loop 2128 RET 2129 2130%if ARCH_X86_64 2131cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak 2132 movddup m2, [pb_4] 2133%else 2134cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h 2135%define ac_bakq [rsp+16*4] 2136 mov t0d, 0x04040404 2137 movd m2, t0d 2138 pshufd m2, m2, q0000 2139%endif 2140 movifnidn wd, wm 2141 movifnidn hpadd, hpadm 2142 movd m0, hpadd 2143 mov t0d, hm 2144 mov hd, t0d 2145 imul t0d, wd 2146 movd m6, t0d 2147 movd hpadd, m0 2148 mov ac_bakq, acq 2149 shl hpadd, 2 2150 sub hd, hpadd 2151 pxor m5, m5 2152 pxor m4, m4 2153 cmp wd, 16 2154 jg .w32 2155 cmp wd, 8 2156 jg .w16 2157 je .w8 2158 ; fall-through 2159 2160%if ARCH_X86_64 2161 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 2162%else 2163 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 2164%endif 2165.w4: 2166 lea stride3q, [strideq*3] 2167.w4_loop: 2168 movd m1, [yq] 2169 movd m3, [yq+strideq] 2170 punpckldq m1, m3 2171 punpcklbw m1, m1 2172 movd m0, [yq+strideq*2] 2173 movd m3, [yq+stride3q] 2174 punpckldq m0, m3 2175 punpcklbw m0, m0 2176 pmaddubsw m1, m2 2177 pmaddubsw m0, m2 2178 mova [acq], m1 2179 mova [acq+16], m0 2180 paddw m5, m0 2181 paddw m5, m1 2182 lea yq, [yq+strideq*4] 2183 add acq, 32 2184 sub hd, 4 2185 jg .w4_loop 2186 test hpadd, hpadd 2187 jz .calc_avg_4 2188 punpckhqdq m0, m0 2189.w4_hpad_loop: 2190 mova [acq], m0 2191 paddw m5, m0 2192 add acq, 16 2193 sub hpadd, 2 2194 jg .w4_hpad_loop 2195.calc_avg_4: 2196 psrlw m2, 10 2197 pmaddwd m5, m2 2198 jmp .calc_avg 2199 2200.w8: 2201 lea stride3q, [strideq*3] 2202 test wpadd, 
wpadd 2203 jnz .w8_wpad 2204.w8_loop: 2205 movq m1, [yq] 2206 punpcklbw m1, m1 2207 pmaddubsw m1, m2 2208 mova [acq], m1 2209 paddw m5, m1 2210 movq m0, [yq+strideq] 2211 punpcklbw m0, m0 2212 pmaddubsw m0, m2 2213 mova [acq+16], m0 2214 paddw m5, m0 2215 movq m1, [yq+strideq*2] 2216 punpcklbw m1, m1 2217 pmaddubsw m1, m2 2218 mova [acq+32], m1 2219 paddw m4, m1 2220 movq m0, [yq+stride3q] 2221 punpcklbw m0, m0 2222 pmaddubsw m0, m2 2223 mova [acq+48], m0 2224 paddw m4, m0 2225 lea yq, [yq+strideq*4] 2226 add acq, 64 2227 sub hd, 4 2228 jg .w8_loop 2229 test hpadd, hpadd 2230 jz .calc_avg_8_16 2231 jmp .w8_hpad 2232.w8_wpad: 2233 movd m1, [yq] 2234 punpcklbw m1, m1 2235 punpcklqdq m1, m1 2236 pmaddubsw m1, m2 2237 pshufhw m1, m1, q3333 2238 mova [acq], m1 2239 paddw m5, m1 2240 movd m0, [yq+strideq] 2241 punpcklbw m0, m0 2242 punpcklqdq m0, m0 2243 pmaddubsw m0, m2 2244 pshufhw m0, m0, q3333 2245 mova [acq+16], m0 2246 paddw m4, m0 2247 lea yq, [yq+strideq*2] 2248 add acq, 32 2249 sub hd, 2 2250 jg .w8_wpad 2251 test hpadd, hpadd 2252 jz .calc_avg_8_16 2253.w8_hpad: 2254 mova [acq], m0 2255 paddw m5, m0 2256 mova [acq+16], m0 2257 paddw m4, m0 2258 add acq, 32 2259 sub hpadd, 2 2260 jg .w8_hpad 2261 jmp .calc_avg_8_16 2262 2263.w16: 2264 test wpadd, wpadd 2265 jnz .w16_wpad 2266.w16_loop: 2267 mova m0, [yq] 2268 mova m1, m0 2269 punpcklbw m1, m1 2270 pmaddubsw m1, m2 2271 mova [acq], m1 2272 paddw m5, m1 2273 punpckhbw m0, m0 2274 pmaddubsw m0, m2 2275 mova [acq+16], m0 2276 paddw m5, m0 2277 mova m0, [yq+strideq] 2278 mova m1, m0 2279 punpcklbw m1, m1 2280 pmaddubsw m1, m2 2281 mova [acq+32], m1 2282 paddw m4, m1 2283 punpckhbw m0, m0 2284 pmaddubsw m0, m2 2285 mova [acq+48], m0 2286 paddw m4, m0 2287 lea yq, [yq+strideq*2] 2288 add acq, 64 2289 sub hd, 2 2290 jg .w16_loop 2291 test hpadd, hpadd 2292 jz .calc_avg_8_16 2293 jmp .w16_hpad_loop 2294.w16_wpad: 2295 cmp wpadd, 2 2296 jl .w16_pad1 2297 je .w16_pad2 2298.w16_pad3: 2299 movd m1, [yq] 2300 punpcklbw m1, m1 
2301 punpcklqdq m1, m1 2302 pshufhw m1, m1, q3333 2303 pmaddubsw m1, m2 2304 mova [acq], m1 2305 paddw m5, m1 2306 punpckhqdq m1, m1 2307 mova [acq+16], m1 2308 paddw m5, m1 2309 movd m1, [yq+strideq] 2310 punpcklbw m1, m1 2311 punpcklqdq m1, m1 2312 pshufhw m1, m1, q3333 2313 pmaddubsw m1, m2 2314 mova [acq+32], m1 2315 paddw m4, m1 2316 punpckhqdq m0, m1, m1 2317 mova [acq+48], m0 2318 paddw m4, m0 2319 lea yq, [yq+strideq*2] 2320 add acq, 64 2321 sub hd, 2 2322 jg .w16_pad3 2323 jmp .w16_wpad_done 2324.w16_pad2: 2325 movq m1, [yq] 2326 punpcklbw m1, m1 2327 pmaddubsw m1, m2 2328 mova [acq], m1 2329 paddw m5, m1 2330 pshufhw m1, m1, q3333 2331 punpckhqdq m1, m1 2332 mova [acq+16], m1 2333 paddw m5, m1 2334 movq m1, [yq+strideq] 2335 punpcklbw m1, m1 2336 pmaddubsw m1, m2 2337 mova [acq+32], m1 2338 paddw m4, m1 2339 mova m0, m1 2340 pshufhw m0, m0, q3333 2341 punpckhqdq m0, m0 2342 mova [acq+48], m0 2343 paddw m4, m0 2344 lea yq, [yq+strideq*2] 2345 add acq, 64 2346 sub hd, 2 2347 jg .w16_pad2 2348 jmp .w16_wpad_done 2349.w16_pad1: 2350 mova m0, [yq] 2351 mova m1, m0 2352 punpcklbw m1, m1 2353 pmaddubsw m1, m2 2354 mova [acq], m1 2355 paddw m5, m1 2356 punpckhbw m0, m0 2357 punpcklqdq m0, m0 2358 pshufhw m0, m0, q3333 2359 pmaddubsw m0, m2 2360 mova [acq+16], m0 2361 paddw m5, m0 2362 mova m0, [yq+strideq] 2363 mova m1, m0 2364 punpcklbw m1, m1 2365 pmaddubsw m1, m2 2366 mova [acq+32], m1 2367 paddw m4, m1 2368 punpckhbw m0, m0 2369 punpcklqdq m0, m0 2370 pshufhw m0, m0, q3333 2371 pmaddubsw m0, m2 2372 mova [acq+48], m0 2373 paddw m4, m0 2374 lea yq, [yq+strideq*2] 2375 add acq, 64 2376 sub hd, 2 2377 jg .w16_pad1 2378.w16_wpad_done: 2379 test hpadd, hpadd 2380 jz .calc_avg_8_16 2381.w16_hpad_loop: 2382 mova [acq], m1 2383 mova [acq+16], m0 2384 paddw m4, m1 2385 paddw m5, m0 2386 mova [acq+32], m1 2387 mova [acq+48], m0 2388 paddw m4, m1 2389 paddw m5, m0 2390 add acq, 64 2391 sub hpadd, 2 2392 jg .w16_hpad_loop 2393.calc_avg_8_16: 2394 mova m0, m5 2395 psrld 
m5, 16 2396 pslld m0, 16 2397 psrld m0, 16 2398 paddd m5, m0 2399 mova m0, m4 2400 psrld m0, 16 2401 pslld m4, 16 2402 psrld m4, 16 2403 paddd m0, m4 2404 paddd m5, m0 2405 jmp .calc_avg 2406 2407.w32: 2408 pxor m0, m0 2409 mova [rsp ], m0 2410 mova [rsp+16], m0 2411 mova [rsp+32], m0 2412 mova [rsp+48], m0 2413 test wpadd, wpadd 2414 jnz .w32_wpad 2415.w32_loop: 2416 mova m0, [yq] 2417 mova m1, m0 2418 punpcklbw m1, m1 2419 pmaddubsw m1, m2 2420 mova [acq], m1 2421 paddw m5, m1, [rsp] 2422 mova [rsp ], m5 2423 punpckhbw m0, m0 2424 pmaddubsw m0, m2 2425 mova [acq+16], m0 2426 paddw m5, m0, [rsp+16] 2427 mova [rsp+16], m5 2428 mova m4, [yq+16] 2429 mova m3, m4 2430 punpcklbw m3, m3 2431 pmaddubsw m3, m2 2432 mova [acq+32], m3 2433 paddw m5, m3, [rsp+32] 2434 mova [rsp+32], m5 2435 punpckhbw m4, m4 2436 pmaddubsw m4, m2 2437 mova [acq+48], m4 2438 paddw m5, m4, [rsp+48] 2439 mova [rsp+48], m5 2440 lea yq, [yq+strideq] 2441 add acq, 64 2442 sub hd, 1 2443 jg .w32_loop 2444 test hpadd, hpadd 2445 jz .calc_avg_32 2446 jmp .w32_hpad_loop 2447.w32_wpad: 2448 cmp wpadd, 2 2449 jl .w32_pad1 2450 je .w32_pad2 2451 cmp wpadd, 4 2452 jl .w32_pad3 2453 je .w32_pad4 2454 cmp wpadd, 6 2455 jl .w32_pad5 2456 je .w32_pad6 2457.w32_pad7: 2458 movd m1, [yq] 2459 punpcklbw m1, m1 2460 punpcklqdq m1, m1 2461 pshufhw m1, m1, q3333 2462 pmaddubsw m1, m2 2463 mova [acq], m1 2464 paddw m5, m1, [rsp] 2465 mova [rsp ], m5 2466 mova m0, m1 2467 punpckhqdq m0, m0 2468 mova [acq+16], m0 2469 paddw m5, m0, [rsp+16] 2470 mova [rsp+16], m5 2471 mova m3, m0 2472 mova [acq+32], m3 2473 paddw m5, m3, [rsp+32] 2474 mova [rsp+32], m5 2475 mova m4, m3 2476 mova [acq+48], m4 2477 paddw m5, m4, [rsp+48] 2478 mova [rsp+48], m5 2479 lea yq, [yq+strideq] 2480 add acq, 64 2481 sub hd, 1 2482 jg .w32_pad7 2483 jmp .w32_wpad_done 2484.w32_pad6: 2485 mova m0, [yq] 2486 mova m1, m0 2487 punpcklbw m1, m1 2488 pmaddubsw m1, m2 2489 mova [acq], m1 2490 paddw m5, m1, [rsp] 2491 mova [rsp ], m5 2492 pshufhw m0, m1, 
q3333 2493 punpckhqdq m0, m0 2494 mova [acq+16], m0 2495 paddw m5, m0, [rsp+16] 2496 mova [rsp+16], m5 2497 mova m3, m0 2498 mova [acq+32], m3 2499 paddw m5, m3, [rsp+32] 2500 mova [rsp+32], m5 2501 mova m4, m3 2502 mova [acq+48], m4 2503 paddw m5, m4, [rsp+48] 2504 mova [rsp+48], m5 2505 lea yq, [yq+strideq] 2506 add acq, 64 2507 sub hd, 1 2508 jg .w32_pad6 2509 jmp .w32_wpad_done 2510.w32_pad5: 2511 mova m0, [yq] 2512 mova m1, m0 2513 punpcklbw m1, m1 2514 pmaddubsw m1, m2 2515 mova [acq], m1 2516 mova m5, [rsp] 2517 paddw m5, m1 2518 mova [rsp ], m5 2519 punpckhbw m0, m0 2520 punpcklqdq m0, m0 2521 pshufhw m0, m0, q3333 2522 pmaddubsw m0, m2 2523 mova [acq+16], m0 2524 paddw m5, m0, [rsp+16] 2525 mova [rsp+16], m5 2526 mova m3, m0 2527 punpckhqdq m3, m3 2528 mova [acq+32], m3 2529 paddw m5, m3, [rsp+32] 2530 mova [rsp+32], m5 2531 mova m4, m3 2532 mova [acq+48], m4 2533 paddw m5, m4, [rsp+48] 2534 mova [rsp+48], m5 2535 lea yq, [yq+strideq] 2536 add acq, 64 2537 sub hd, 1 2538 jg .w32_pad5 2539 jmp .w32_wpad_done 2540.w32_pad4: 2541 mova m0, [yq] 2542 mova m1, m0 2543 punpcklbw m1, m1 2544 pmaddubsw m1, m2 2545 mova [acq], m1 2546 paddw m5, m1, [rsp] 2547 mova [rsp ], m5 2548 punpckhbw m0, m0 2549 pmaddubsw m0, m2 2550 mova [acq+16], m0 2551 paddw m5, m0, [rsp+16] 2552 mova [rsp+16], m5 2553 mova m3, m0 2554 pshufhw m3, m3, q3333 2555 punpckhqdq m3, m3 2556 mova [acq+32], m3 2557 paddw m5, m3, [rsp+32] 2558 mova [rsp+32], m5 2559 mova m4, m3 2560 mova [acq+48], m4 2561 paddw m5, m4, [rsp+48] 2562 mova [rsp+48], m5 2563 lea yq, [yq+strideq] 2564 add acq, 64 2565 sub hd, 1 2566 jg .w32_pad4 2567 jmp .w32_wpad_done 2568.w32_pad3: 2569 mova m0, [yq] 2570 mova m1, m0 2571 punpcklbw m1, m1 2572 pmaddubsw m1, m2 2573 mova [acq], m1 2574 paddw m5, m1, [rsp] 2575 mova [rsp ], m5 2576 punpckhbw m0, m0 2577 pmaddubsw m0, m2 2578 mova [acq+16], m0 2579 paddw m5, m0, [rsp+16] 2580 mova [rsp+16], m5 2581 movd m3, [yq+16] 2582 punpcklbw m3, m3 2583 punpcklqdq m3, m3 2584 
; --- continuation: tail of a cfl/ac-style averaging function whose cglobal
;     entry point lies above this chunk.  Register roles visible here:
;       acq  = output "ac" coefficient pointer   yq = source row pointer
;       m2   = pmaddubsw multiplier pair (set up before this chunk - TODO confirm)
;       [rsp+0..63] = four 8xword running column-sum accumulators
;     NOTE(review): the .w32_pad* variants differ only in how many of the 32
;     luma bytes per row are valid; padded lanes are filled by replicating
;     the last valid pixels.
    pshufhw             m3, m3, q3333        ; broadcast last word into high half (width padding)
    pmaddubsw           m3, m2
    mova         [acq+32], m3
    paddw               m5, m3, [rsp+32]     ; accumulate column sums for the average
    mova         [rsp+32], m5
    mova                m4, m3
    punpckhqdq          m4, m4               ; replicate upper qword (more width padding)
    mova         [acq+48], m4
    paddw               m5, m4, [rsp+48]
    mova         [rsp+48], m5
    lea                 yq, [yq+strideq]
    add                acq, 64
    sub                 hd, 1
    jg .w32_pad3
    jmp .w32_wpad_done
.w32_pad2:                                   ; 16 valid luma bytes per row
    mova                m0, [yq]
    mova                m1, m0
    punpcklbw           m1, m1               ; duplicate bytes into byte pairs for pmaddubsw
    pmaddubsw           m1, m2
    mova             [acq], m1
    paddw               m5, m1, [rsp]
    mova          [rsp   ], m5
    punpckhbw           m0, m0
    pmaddubsw           m0, m2
    mova         [acq+16], m0
    paddw               m5, m0, [rsp+16]
    mova         [rsp+16], m5
    mova                m3, [yq+16]
    punpcklbw           m3, m3
    pmaddubsw           m3, m2
    mova         [acq+32], m3
    paddw               m5, m3, [rsp+32]
    mova         [rsp+32], m5
    pshufhw             m4, m3, q3333        ; broadcast last valid result word
    punpckhqdq          m4, m4               ; ...across the whole padded vector
    mova         [acq+48], m4
    paddw               m5, m4, [rsp+48]
    mova         [rsp+48], m5
    lea                 yq, [yq+strideq]
    add                acq, 64
    sub                 hd, 1
    jg .w32_pad2
    jmp .w32_wpad_done
.w32_pad1:                                   ; 24 valid luma bytes per row
    mova                m0, [yq]
    mova                m1, m0
    punpcklbw           m1, m1
    pmaddubsw           m1, m2
    mova             [acq], m1
    paddw               m5, m1, [rsp]
    mova          [rsp   ], m5
    punpckhbw           m0, m0
    pmaddubsw           m0, m2
    mova         [acq+16], m0
    paddw               m5, m0, [rsp+16]
    mova         [rsp+16], m5
    mova                m4, [yq+16]
    mova                m3, m4
    punpcklbw           m3, m3
    pmaddubsw           m3, m2
    mova         [acq+32], m3
    paddw               m5, m3, [rsp+32]
    mova         [rsp+32], m5
    punpckhbw           m4, m4
    punpcklqdq          m4, m4               ; keep only the valid low qword...
    pshufhw             m4, m4, q3333        ; ...and replicate its last pixel pair
    pmaddubsw           m4, m2
    mova         [acq+48], m4
    paddw               m5, m4, [rsp+48]
    mova         [rsp+48], m5
    lea                 yq, [yq+strideq]
    add                acq, 64
    sub                 hd, 1
    jg .w32_pad1
.w32_wpad_done:
    test             hpadd, hpadd            ; any bottom (height) padding rows?
    jz .calc_avg_32
.w32_hpad_loop:                              ; replicate last computed row m1/m0/m3/m4
    mova             [acq], m1
    mova         [acq+16], m0
    paddw               m5, m1, [rsp]
    mova          [rsp   ], m5
    paddw               m5, m0, [rsp+16]
    mova         [rsp+16], m5
    mova         [acq+32], m3
    mova         [acq+48], m4
    paddw               m5, m3, [rsp+32]
    mova         [rsp+32], m5
    paddw               m5, m4, [rsp+48]
    mova         [rsp+48], m5
    add                acq, 64
    sub              hpadd, 1
    jg .w32_hpad_loop

; re-declare the argument-register name map for the averaging tail
; (DEFINE_ARGS emits no code; x86_64 has a spare reg for the saved ac pointer)
%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif

.calc_avg_32:
    ; widen the four word accumulators to dwords and sum them into m5:
    ; each pair (psrld 16 / pslld 16+psrld 16) splits odd/even words.
    mova                m5, [rsp]
    mova                m0, m5
    psrld               m5, 16               ; odd words
    pslld               m0, 16
    psrld               m0, 16               ; even words, zero-extended
    paddd               m5, m0
    mova                m0, [rsp+16]
    mova                m3, m0
    psrld               m0, 16
    pslld               m3, 16
    psrld               m3, 16
    paddd               m0, m3
    paddd               m5, m0
    mova                m0, [rsp+32]
    mova                m3, m0
    psrld               m0, 16
    pslld               m3, 16
    psrld               m3, 16
    paddd               m0, m3
    mova                m1, [rsp+48]
    mova                m3, m1
    psrld               m1, 16
    pslld               m3, 16
    psrld               m3, 16
    paddd               m1, m3
    paddd               m1, m0
    paddd               m5, m1
.calc_avg:
    ; m6 holds sz (element count) - set before this chunk, TODO confirm
    movd               szd, m6
    psrad               m6, 1                ; sz/2 = rounding bias
    tzcnt              r1d, szd              ; const int log2sz = ctz(width) + ctz(height);
    paddd               m5, m6
    movd                m1, r1d
    pshufd              m0, m5, q2301        ; horizontal dword reduction...
    paddd               m0, m5
    pshufd              m5, m0, q1032
    paddd               m0, m5               ; ...all lanes now hold the total sum
    psrad               m0, m1               ; sum >>= log2sz;
    packssdw            m0, m0               ; broadcast average as words
    RELOAD_ACQ_32      acq                   ; ac = ac_orig
.sub_loop:
    ; subtract the average from every stored ac coefficient
    mova                m1, [acq]
    psubw               m1, m0
    mova             [acq], m1
    add                acq, 16
    sub                szd, 8                ; 8 words per iteration
    jg .sub_loop
    RET

; Byte-wise select: result = (mask & true) | (~mask & false).
; %1 simd register that hold the mask and will hold the result
; %2 simd register that holds the "true" values
; %3 location of the "false" values (simd register/memory)
%macro BLEND 3 ; mask, true, false
    pand                %2, %1
    pandn               %1, %3
    por                 %1, %2
%endmacro

; One 16-pixel Paeth predictor step.
; Inputs:  m%1 = top row, %2 = ldiff (|left-topleft|, register number or mem),
;          m3 = left (broadcast), m5 = topleft (broadcast), m4 = pb_1-style
;          LSB mask (set by caller - TODO confirm).
; Output:  m1 = predicted pixels.  Clobbers m0, m2.
%macro PAETH 2 ; top, ldiff
    ; tldiff = |top + left - 2*topleft|, computed via the avg trick:
    ; avg(t,l) = (t+l+1)>>1, corrected by the lost LSB (t^l)&1, then doubled.
    pavgb               m1, m%1, m3
    pxor                m0, m%1, m3
    pand                m0, m4
    psubusb             m2, m5, m1
    psubb               m1, m0
    psubusb             m1, m5
    por                 m1, m2
    paddusb             m1, m1
    por                 m1, m0 ; min(tldiff, 255)
    psubusb             m2, m5, m3
    psubusb             m0, m3, m5
    por                 m2, m0 ; tdiff = |left - topleft|
%ifnum %2
    pminub              m2, m%2
    pcmpeqb             m0, m%2, m2 ; ldiff <= tdiff
%else
    mova                m0, %2
    pminub              m2, m0
    pcmpeqb             m0, m2
%endif
    pminub              m1, m2
    pcmpeqb             m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
    mova                m2, m3
    BLEND               m0, m2, m%1          ; pick left vs top by smaller diff
    BLEND               m1, m0, m5           ; pick that vs topleft
%endmacro

; void ipred_paeth_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                       int w, int h, ...)
; Paeth intra prediction: each pixel selects left, top or top-left,
; whichever is closest to left + top - topleft.
; m5 = topleft broadcast to all lanes for the whole call.
cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
%define base r5-ipred_paeth_ssse3_table
    tzcnt               wd, wm
    movifnidn           hd, hm
    pxor                m0, m0
    movd                m5, [tlq]
    pshufb              m5, m0               ; broadcast topleft byte
    LEA                 r5, ipred_paeth_ssse3_table
    movsxd              wq, [r5+wq*4]        ; width-specialized entry from jump table
    movddup             m4, [base+ipred_paeth_shuf]
    add                 wq, r5
    jmp                 wq
.w4:
    movd                m6, [tlq+1] ; top
    pshufd              m6, m6, q0000
    lea                 r3, [strideq*3]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0 ; ldiff (|top - topleft|, constant per column)
.w4_loop:
    sub                tlq, 4                ; walk left edge upwards, 4 rows at a time
    movd                m3, [tlq]
    mova                m1, [base+ipred_h_shuf]
    pshufb              m3, m1 ; left        ; one left pixel broadcast per output row
    PAETH                6, 7
    movd  [dstq          ], m1
    pshuflw             m0, m1, q1032
    movd  [dstq+strideq  ], m0
    punpckhqdq          m1, m1
    movd  [dstq+strideq*2], m1
    psrlq               m1, 32
    movd  [dstq+r3       ], m1
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    movddup             m6, [tlq+1]          ; top row (8 px, duplicated)
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
.w8_loop:
    sub                tlq, 2                ; two rows per iteration
    movd                m3, [tlq]
    pshufb              m3, [base+ipred_paeth_shuf]
    PAETH                6, 7
    movq   [dstq        ], m1
    movhps [dstq+strideq], m1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    movu                m6, [tlq+1]          ; full 16-px top row
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
.w16_loop:
    sub                tlq, 1                ; one row per iteration
    movd                m3, [tlq]
    pxor                m1, m1
    pshufb              m3, m1               ; broadcast left pixel
    PAETH                6, 7
    mova            [dstq], m1
    add               dstq, strideq
    sub                 hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    ; top/ldiff for both 16-px halves are precomputed and spilled to the
    ; stack frame (only 8 xmm regs available on x86-32).
    movu                m6, [tlq+1]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
    mova          [rsp   ], m6
    mova          [rsp+16], m7
    movu                m6, [tlq+17]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
    mova          [rsp+32], m6               ; second half's ldiff stays live in m7
.w32_loop:
    dec                tlq
    movd                m3, [tlq]
    pxor                m1, m1
    pshufb              m3, m1
    mova                m6, [rsp]
    PAETH                6, [rsp+16]
    mova       [dstq     ], m1
    mova                m6, [rsp+32]
    PAETH                6, 7
    mova       [dstq+16  ], m1
    add               dstq, strideq
    dec                 hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    ; same as .w32 but four 16-px slices; the last slice's ldiff stays in m7
    movu                m6, [tlq+1]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
    mova          [rsp   ], m6
    mova          [rsp+16], m7
    movu                m6, [tlq+17]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
    mova          [rsp+32], m6
    mova          [rsp+48], m7
    movu                m6, [tlq+33]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
    mova          [rsp+64], m6
    mova          [rsp+80], m7
    movu                m6, [tlq+49]
    psubusb             m7, m5, m6
    psubusb             m0, m6, m5
    por                 m7, m0
    mova          [rsp+96], m6
.w64_loop:
    dec                tlq
    movd                m3, [tlq]
    pxor                m1, m1
    pshufb              m3, m1
    mova                m6, [rsp]
    PAETH                6, [rsp+16]
    mova        [dstq    ], m1
    mova                m6, [rsp+32]
    PAETH                6, [rsp+48]
    mova        [dstq+16 ], m1
    mova                m6, [rsp+64]
    PAETH                6, [rsp+80]
    mova        [dstq+32 ], m1
    mova                m6, [rsp+96]
    PAETH                6, 7
    mova        [dstq+48 ], m1
    add               dstq, strideq
    dec                 hd
    jg .w64_loop
    RET


; One filter-intra step: gather the 7 source pixels via shuffle %4, apply the
; four tap pairs held in m2-m5 (pmaddubsw), round ((x+8)>>4) and pack to bytes.
; %1 = dst reg number, %2 = src reg number (clobbered), %3 = tmp reg number,
; %4 = shuffle (register number or memory operand).
%macro FILTER 4 ;dst, src, tmp, shuf
%ifnum %4
    pshufb          m%2, m%4
%else
    pshufb          m%2, %4
%endif
    pshufd          m%1, m%2, q0000          ;p0 p1
    pmaddubsw       m%1, m2
    pshufd          m%3, m%2, q1111          ;p2 p3
    pmaddubsw       m%3, m3
    paddw           m%1, [base+pw_8]         ; rounding bias for >>4
    paddw           m%1, m%3
    pshufd          m%3, m%2, q2222          ;p4 p5
    pmaddubsw       m%3, m4
    paddw           m%1, m%3
    pshufd          m%3, m%2, q3333          ;p6 __
    pmaddubsw       m%3, m5
    paddw           m%1, m%3
    psraw           m%1, 4
    packuswb        m%1, m%1
%endmacro

; void ipred_filter_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                        int w, int h, int filter_idx, ...)
; FILTER_INTRA prediction: processes 4x2 output blocks; taps are loaded from
; filter_intra_taps, 64 bytes (4 xmm) per filter index.
cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
%define base r6-$$
    LEA                   r6, $$
    tzcnt                 wd, wm
%ifidn filterd, filterm
    movzx            filterd, filterb
%else
    movzx            filterd, byte filterm
%endif
    shl              filterd, 6              ; filter_idx * 64 bytes of taps
    lea              filterq, [base+filter_intra_taps+filterq]
    movq                  m0, [tlq-3]        ;_ 6 5 0 1 2 3 4
    movsxd                wq, [base+ipred_filter_ssse3_table+wq*4]
    mova                  m2, [filterq+16*0] ; tap pairs stay resident in m2-m5
    mova                  m3, [filterq+16*1]
    mova                  m4, [filterq+16*2]
    mova                  m5, [filterq+16*3]
    lea                   wq, [base+ipred_filter_ssse3_table+wq]
    mov                   hd, hm
    jmp                   wq
.w4:
    mova                  m1, [base+filter_shuf1]
    sub                  tlq, 3
    sub                  tlq, hq             ; tlq+hq now indexes the left edge per row pair
    jmp .w4_loop_start
.w4_loop:
    movd                  m0, [tlq+hq]
    punpckldq             m0, m6             ; combine left pixels with previous output
    lea                 dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER                 6, 0, 7, 1
    movd    [dstq+strideq*0], m6
    pshuflw               m6, m6, q1032
    movd    [dstq+strideq*1], m6
    sub                   hd, 2
    jg .w4_loop
    RET

ALIGN function_align
.w8:
    movq                  m6, [tlq+1]        ;_ _ _ 0 1 2 3 4
    sub                  tlq, 5
    sub                  tlq, hq

.w8_loop:
    ; two 4x2 blocks; the first block's output feeds the second's left edge
    FILTER                 7, 0, 1, [base+filter_shuf1]
    punpcklqdq            m6, m7             ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER                 0, 6, 1, [base+filter_shuf2]

    punpckldq             m6, m7, m0
    movq    [dstq+strideq*0], m6
    punpckhqdq            m6, m6
    movq    [dstq+strideq*1], m6

    movd                  m0, [tlq+hq]       ;_ 6 5 0
    punpckldq             m0, m6             ;_ 6 5 0 1 2 3 4

    lea                 dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w8_loop
    RET

ALIGN function_align
.w16:
    movu                  m6, [tlq+1]        ;top row
    sub                  tlq, 5
    sub                  tlq, hq

.w16_loop:
    ; four chained 4x2 blocks across the row pair; m6/m7 alternate as
    ; previous-output / current-output, palignr slides the top row along
    FILTER                 7, 0, 1, [base+filter_shuf1]
    punpcklqdq            m0, m6, m7         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd    [dstq+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m7, m6         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+4+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4

    FILTER                 7, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m6, m7         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+8+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    movd [dstq+12+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4
    mova    [dstq+strideq*1], m6             ; second row written as one 16-px store

    movd                  m0, [tlq+hq]       ;_ 6 5 0
    punpckldq             m0, m6             ;_ 6 5 0 1 2 3 4

    lea                 dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w16_loop
    RET

ALIGN function_align
.w32:
    movu                  m6, [tlq+1]        ;top row
    lea              filterq, [tlq+17]       ; filterq reused: points at top px 16..31
    sub                  tlq, 5
    sub                  tlq, hq

.w32_loop:
    ; same chaining as .w16 but across 8 blocks; at the halfway point the
    ; right half's top row is loaded from [filterq]
    FILTER                 7, 0, 1, [base+filter_shuf1]
    punpcklqdq            m0, m6, m7         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd    [dstq+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m7, m6         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+4+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4

    FILTER                 7, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m6, m7         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+8+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    movu                  m1, [filterq]      ; right half's top row
    punpckldq             m0, m7, m1         ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
    punpcklqdq            m0, m6             ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+12+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4
    mova    [dstq+strideq*1], m6

    mova                  m6, m1

    FILTER                 7, 0, 6, [base+filter_shuf2]
    punpcklqdq            m0, m1, m7         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+16+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m1, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m7, m6         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+20+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4

    FILTER                 7, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m6, m7         ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+24+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    movd [dstq+28+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4
    mova [dstq+16+strideq*1], m6

    mova                  m6, [dstq+strideq*1] ; reload left half of just-written row
    movd                  m0, [tlq+hq]       ;_ 6 5 0
    punpckldq             m0, m6             ;_ 6 5 0 1 2 3 4
    lea              filterq, [dstq+16+strideq*1] ; next iteration's right-half "top" is this row
    lea                 dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w32_loop
    RET