; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Emits both smooth-filter weight tables from one list of sm_weights[]:
; a 1-D table (weight << 7, for pmulhrsw) and a 2-D table of
; (weight, 256 - weight) pairs (for pmaddwd).
%macro SMOOTH_WEIGHTS 1-*
const smooth_weights_1d_16bpc ; sm_weights[] << 7
    %rep %0
        dw %1*128
        %rotate 1
    %endrep
const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
    %rep %0
        dw %1, 256-%1
        %rotate 1
    %endrep
%endmacro

; Weight lists for block sizes 4, 8, 16, 32 and 64 (concatenated).
SMOOTH_WEIGHTS   0,   0, 255, 128, 255, 149,  85,  64, \
               255, 197, 146, 105,  73,  50,  37,  32, \
               255, 225, 196, 170, 145, 123, 102,  84, \
                68,  54,  43,  33,  26,  20,  17,  16, \
               255, 240, 225, 210, 196, 182, 169, 157, \
               145, 133, 122, 111, 101,  92,  83,  74, \
                66,  59,  52,  45,  39,  34,  29,  25, \
                21,  17,  14,  12,  10,   9,   8,   8, \
               255, 248, 240, 233, 225, 218, 210, 203, \
               196, 189, 182, 176, 169, 163, 156, 150, \
               144, 138, 133, 127, 121, 116, 111, 106, \
               101,  96,  91,  86,  82,  77,  73,  69, \
                65,  61,  57,  54,  50,  47,  44,  41, \
                38,  35,  32,  29,  27,  25,  22,  20, \
                18,  16,  15,  13,  12,  10,   9,   8, \
                 7,   6,   6,   5,   5,   4,   4,   4

%if ARCH_X86_64

; pshufb/permute patterns and per-mode constant tables.
ipred_hv_shuf:  db  6,  7,  6,  7,  0,  1,  2,  3,  2,  3,  2,  3,  8,  9, 10, 11
                db  4,  5,  4,  5,  4,  5,  6,  7,  0,  1,  0,  1, 12, 13, 14, 15
filter_shuf1:   db  8,  9,  0,  1,  2,  3,  4,  5,  6,  7, 14, 15, 12, 13, -1, -1
filter_shuf2:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
filter_shuf3:   db 12, 13,  0,  1,  2,  3,  4,  5,  6,  7, 10, 11,  8,  9, -1, -1
pal_pred_shuf:  db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
z_base_inc:     dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
                dw  8*64,  9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
z_filter_t0:    db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:    db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_filter_wh:    db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
                db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
pw_m1024:       times 2 dw -1024
pw_1to16:       dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
pw_16to1:       dw 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
z2_ymul:        dw  1,  2,  1,  2,  1,  2,  1,  2,  3,  4,  3,  4,  3,  4,  3,  4
z2_ymul8:       dw  1,  2,  5,  6,  3,  4,  7,  8,  5,  6, 16, 16,  7,  8
pb_90:          times 4 db 90
z2_y_shuf_h4:   dd  3,  7,  2,  6,  1,  5,  0,  4
z_upsample:     db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_x_shuf:      db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_y_shuf:      db  6,  7, 14, 15,  4,  5, 12, 13,  4,  5, 12, 13,  2,  3, 10, 11
z2_y_shuf_us:   db  6,  7, 14, 15,  2,  3, 10, 11,  4,  5, 12, 13,  0,  1,  8,  9
z_filter_k:     dw  4,  4,  5,  5,  4,  4
                dw  8,  8,  6,  6,  4,  4
                dw  0,  0,  0,  0,  2,  2

; Broadcastable word constants aliased onto words that already exist
; inside the tables above (saves rodata space).
%define pw_2  (z_filter_k+32)
%define pw_4  (z_filter_k+ 0)
%define pw_16 (z2_ymul8 +20)

pw_1:    times 2 dw 1
pw_3:    times 2 dw 3
pw_62:   times 2 dw 62
pw_512:  times 2 dw 512
pw_2048: times 2 dw 2048
pd_8:    dd 8

; Builds a table of 32-bit offsets from the function entry to its
; per-size labels; the offsets are biased by -2*4 so the w4 entry
; sits at index 2 (see the movsxd [r5+wq*4] loads in .text).
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The splat tables are tails of the dc tables (skip the h*/w* entries).
%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)

JMP_TABLE ipred_dc_16bpc,       avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                      s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_16bpc,  avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h_16bpc,        avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_16bpc,    avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,       avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,       avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,       avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_filter_16bpc,   avx2, w4, w8, w16, w32
JMP_TABLE ipred_cfl_16bpc,      avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                      s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc,         avx2, w4, w8, w16, w32, w64

cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

INIT_YMM avx2
; DC prediction from the top edge only: sums the top row, rounds,
; shifts by log2(w) (xm5), then splats and falls through to the shared
; .s* store loops via the splat table.
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    movifnidn        hd, hm
    add             tlq, 2
    movd            xm4, wd                  ; rounding bias = w/2 (via pavgw with 0)
    pxor            xm3, xm3
    pavgw           xm4, xm3
    tzcnt            wd, wd
    movd            xm5, wd                  ; shift = log2(w)
    movu             m0, [tlq]
    lea              r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd           r6, [r5+wq*4]           ; reuse the .h* summing code, indexed by w
    add              r6, r5
    add              r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    movsxd           wq, [r5+wq*4]           ; wq = splat store entry
    add              wq, r5
    jmp              r6

; DC prediction from the left edge only; same structure as _top but the
; edge lives below tlq (tlq is rewound by 2*h first).
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    mov              hd, hm
    sub             tlq, hq
    movd            xm4, hd                  ; rounding bias = h/2
    sub             tlq, hq
    pxor            xm3, xm3
    pavgw           xm4, xm3
    tzcnt           r6d, hd
    movd            xm5, r6d                 ; shift = log2(h)
    movu             m0, [tlq]
    lea              r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd           r6, [r5+r6*4]
    add              r6, r5
    add              r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    tzcnt            wd, wd
    movsxd           wq, [r5+wq*4]
    add              wq, r5
    jmp              r6
.h64:
    paddw            m0, [tlq+96]
    paddw            m0, [tlq+64]
.h32:
    paddw            m0, [tlq+32]
.h16:
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
.h8:
    psrldq          xm1, xm0, 8
    paddw           xm0, xm1
.h4:
    ; Horizontal reduce to a single dword sum, round, shift, splat.
    punpcklwd       xm0, xm3
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    paddd           xm0, xm4
    psrld           xm0, xm5
    lea        stride3q, [strideq*3]
    vpbroadcastw     m0, xm0
    mova             m1, m0
    mova             m2, m0
    mova             m3, m0
    jmp              wq

; Full DC prediction: sum of top and left edges divided by w+h.
; For w != h the division is a fixed-point reciprocal multiply
; (the 0xAAAB / 0x6667 magic words — presumably 1/3 and 1/5
; reciprocals scaled; selected by the h/w ratio — verify against the
; reference C code).
cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn        hd, hm
    tzcnt           r6d, hd
    lea             r5d, [wq+hq]
    movd            xm4, r5d                 ; rounding bias = (w+h)/2 (after psrlw 1)
    tzcnt           r5d, r5d
    movd            xm5, r5d                 ; shift = log2(w+h)
    lea              r5, [ipred_dc_16bpc_avx2_table]
    tzcnt            wd, wd
    movsxd           r6, [r5+r6*4]           ; .h* entry (left-edge sum)
    movsxd           wq, [r5+wq*4+5*4]       ; .w* entry (top-edge sum + store)
    pxor             m3, m3
    psrlw           xm4, 1
    add              r6, r5
    add              wq, r5
    lea        stride3q, [strideq*3]
    jmp              r6
.h4:
    movq            xm0, [tlq-8]
    jmp              wq
.w4:
    movq            xm1, [tlq+2]
    paddw            m0, m4
    paddw            m0, m1
    psrlq            m1, m0, 32
    paddw            m0, m1
    psrld            m1, m0, 16
    paddw            m0, m1
    cmp              hd, 4
    jg .w4_mul                               ; w != h: reciprocal divide
    psrlw           xm0, 3                   ; w == h == 4: plain shift
    jmp .w4_end
.w4_mul:
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
    lea             r2d, [hq*2]
    mov             r6d, 0xAAAB6667
    shrx            r6d, r6d, r2d            ; pick 16-bit reciprocal by h
    punpckhwd       xm1, xm0, xm3
    punpcklwd       xm0, xm3
    paddd           xm0, xm1
    movd            xm1, r6d
    psrld           xm0, 2
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w4_end:
    vpbroadcastw    xm0, xm0
.s4:
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm0
    movq [dstq+stride3q ], xm0
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    mova            xm0, [tlq-16]
    jmp              wq
.w8:
    vextracti128    xm1, m0, 1
    paddw           xm0, [tlq+2]
    paddw           xm0, xm4
    paddw           xm0, xm1
    psrld           xm1, xm0, 16
    paddw           xm0, xm1
    pblendw         xm0, xm3, 0xAA           ; zero odd words before dword reduce
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp              hd, 8
    je .w8_end
    mov             r6d, 0xAAAB
    mov             r2d, 0x6667
    cmp              hd, 32
    cmovz           r6d, r2d
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w8_end:
    vpbroadcastw    xm0, xm0
.s8:
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm0
    mova [dstq+strideq*2], xm0
    mova [dstq+stride3q ], xm0
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova             m0, [tlq-32]
    jmp              wq
.w16:
    paddw            m0, [tlq+2]
    vextracti128    xm1, m0, 1
    paddw           xm0, xm4
    paddw           xm0, xm1
    punpckhwd       xm1, xm0, xm3
    punpcklwd       xm0, xm3
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp              hd, 16
    je .w16_end
    mov             r6d, 0xAAAB
    mov             r2d, 0x6667
    test             hb, 8|32                ; h == 8 or 32 vs h == 4 or 64
    cmovz           r6d, r2d
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w16_end:
    vpbroadcastw     m0, xm0
.s16:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m0
    mova [dstq+strideq*2], m0
    mova [dstq+stride3q ], m0
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova             m0, [tlq-64]
    paddw            m0, [tlq-32]
    jmp              wq
.w32:
    paddw            m0, [tlq+ 2]
    paddw            m0, [tlq+34]
    vextracti128    xm1, m0, 1
    paddw           xm0, xm4
    paddw           xm0, xm1
    punpcklwd       xm1, xm0, xm3
    punpckhwd       xm0, xm3
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp              hd, 32
    je .w32_end
    lea             r2d, [hq*2]
    mov             r6d, 0x6667AAAB
    shrx            r6d, r6d, r2d
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w32_end:
    vpbroadcastw     m0, xm0
    mova             m1, m0
.s32:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m0
    mova [dstq+strideq*2+32*1], m1
    mova [dstq+stride3q +32*0], m0
    mova [dstq+stride3q +32*1], m1
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova             m0, [tlq-128]
    mova             m1, [tlq- 96]
    paddw            m0, [tlq- 64]
    paddw            m1, [tlq- 32]
    paddw            m0, m1
    jmp              wq
.w64:
    movu             m1, [tlq+ 2]
    paddw            m0, [tlq+34]
    paddw            m1, [tlq+66]
    paddw            m0, [tlq+98]
    paddw            m0, m1
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1
    punpcklwd       xm1, xm0, xm3
    punpckhwd       xm0, xm3
    paddd           xm1, xm4
    paddd           xm0, xm1
    psrlq           xm1, xm0, 32
    paddd           xm0, xm1
    psrldq          xm1, xm0, 8
    paddd           xm0, xm1
    psrld           xm0, xm5
    cmp              hd, 64
    je .w64_end
    mov             r6d, 0x6667AAAB
    shrx            r6d, r6d, hd
    movd            xm1, r6d
    pmulhuw         xm0, xm1
    psrlw           xm0, 1
.w64_end:
    vpbroadcastw     m0, xm0
    mova             m1, m0
    mova             m2, m0
    mova             m3, m0
.s64:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*0+32*2], m2
    mova [dstq+strideq*0+32*3], m3
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m2
    mova [dstq+strideq*1+32*3], m3
    lea            dstq, [dstq+strideq*2]
    sub              hd, 2
    jg .s64
    RET

; DC_128 prediction: fill with the bitdepth midpoint. r8m is the
; bitdepth_max parameter; >>11 maps 1023/4095 to an index selecting
; 512 or 2048 from the pw_512/pw_2048 pair.
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov             r6d, r8m
    shr             r6d, 11
    lea              r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt            wd, wd
    movifnidn        hd, hm
    movsxd           wq, [r5+wq*4]
    vpbroadcastd     m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
    mova             m1, m0
    mova             m2, m0
    mova             m3, m0
    add              wq, r5
    lea        stride3q, [strideq*3]
    jmp              wq

; Vertical prediction: replicate the top row into every output row
; via the shared splat store loops.
cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn        hd, hm
    movu             m0, [tlq+ 2]
    movu             m1, [tlq+34]
    movu             m2, [tlq+66]
    movu             m3, [tlq+98]
    lea              r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt            wd, wd
    movsxd           wq, [r5+wq*4]
    add              wq, r5
    lea        stride3q, [strideq*3]
    jmp              wq

; Stores 4 rows, each filled with one left-edge pixel (walking the
; left edge downward). %2 selects movq vs mova for the row width.
%macro IPRED_H 2 ; w, store_type
    vpbroadcastw     m0, [tlq-2]
    vpbroadcastw     m1, [tlq-4]
    vpbroadcastw     m2, [tlq-6]
    vpbroadcastw     m3, [tlq-8]
    sub             tlq, 8
    mov%2 [dstq+strideq*0], m0
    mov%2 [dstq+strideq*1], m1
    mov%2 [dstq+strideq*2], m2
    mov%2 [dstq+stride3q ], m3
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .w%1
    RET
ALIGN function_align
%endmacro

; Horizontal prediction: each row is its left-neighbour pixel splatted.
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    movifnidn        hd, hm
    lea              r5, [ipred_h_16bpc_avx2_table]
    tzcnt            wd, wd
    movsxd           wq, [r5+wq*4]
    add              wq, r5
    lea        stride3q, [strideq*3]
    jmp              wq
INIT_XMM avx2
.w4:
    IPRED_H 4, q
.w8:
    IPRED_H 8, a
INIT_YMM avx2
.w16:
    IPRED_H 16, a
.w32:
    vpbroadcastw     m0, [tlq-2]
    vpbroadcastw     m1, [tlq-4]
    vpbroadcastw     m2, [tlq-6]
    vpbroadcastw     m3, [tlq-8]
    sub             tlq, 8
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m2
    mova [dstq+strideq*2+32*1], m2
    mova [dstq+stride3q +32*0], m3
    mova [dstq+stride3q +32*1], m3
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .w32
    RET
.w64:
    vpbroadcastw     m0, [tlq-2]
    vpbroadcastw     m1, [tlq-4]
    sub             tlq, 4
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*0+32*2], m0
    mova [dstq+strideq*0+32*3], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m1
    mova [dstq+strideq*1+32*3], m1
    lea            dstq, [dstq+strideq*2]
    sub              hd, 2
    jg .w64
    RET

; Paeth predictor for one vector of pixels.
; Inputs: m1 = left, m3 = topleft, m%1 = top,
;         m%2 = top - topleft (signed), m%3 = |top - topleft|.
; Picks whichever of top / left / topleft is closest to left+top-topleft.
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw            m0, m%2, m1
    psubw            m7, m3, m0 ; tldiff
    psubw            m0, m%1    ; tdiff
    pabsw            m7, m7
    pabsw            m0, m0
    pminsw           m7, m0
    pcmpeqw          m0, m7
    pcmpgtw          m7, m%3, m7
    vpblendvb        m0, m3, m%1, m0
    vpblendvb        m0, m1, m0, m7
%endmacro

cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
%define base r5-ipred_paeth_16bpc_avx2_table
    movifnidn        hd, hm
    lea              r5, [ipred_paeth_16bpc_avx2_table]
    tzcnt            wd, wd
    movsxd           wq, [r5+wq*4]
    vpbroadcastw     m3, [tlq] ; topleft
    add              wq, r5
    jmp              wq
.w4:
    vpbroadcastq     m2, [tlq+2] ; top
    movsldup         m6, [base+ipred_hv_shuf]
    lea              r3, [strideq*3]
    psubw            m4, m2, m3
    pabsw            m5, m4
.w4_loop:
    sub             tlq, 8
    vpbroadcastq     m1, [tlq]
    pshufb           m1, m6 ; left
    PAETH 2, 4, 5
    vextracti128    xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128   m2, [tlq+2]
    movsldup         m6, [base+ipred_hv_shuf]
    psubw            m4, m2, m3
    pabsw            m5, m4
.w8_loop:
    sub             tlq, 4
    vpbroadcastd     m1, [tlq]
    pshufb           m1, m6
    PAETH 2, 4, 5
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea            dstq, [dstq+strideq*2]
; (ipred_paeth_16bpc .w8 loop, continued)
    sub              hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    movu             m2, [tlq+2]
    psubw            m4, m2, m3
    pabsw            m5, m4
.w16_loop:
    sub             tlq, 2
    vpbroadcastw     m1, [tlq]
    PAETH 2, 4, 5
    mova         [dstq], m0
    add            dstq, strideq
    dec              hd
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    movu             m2, [tlq+2]
    movu             m6, [tlq+34]
%if WIN64
    ; xmm8/xmm9 are callee-saved on Win64; spill to the home space.
    movaps          r4m, xmm8
    movaps          r6m, xmm9
%endif
    psubw            m4, m2, m3
    psubw            m8, m6, m3
    pabsw            m5, m4
    pabsw            m9, m8
.w32_loop:
    sub             tlq, 2
    vpbroadcastw     m1, [tlq]
    PAETH 2, 4, 5
    mova    [dstq+32*0], m0
    PAETH 6, 8, 9
    mova    [dstq+32*1], m0
    add            dstq, strideq
    dec              hd
    jg .w32_loop
%if WIN64
    movaps         xmm8, r4m
    movaps         xmm9, r6m
%endif
    RET
ALIGN function_align
.w64:
    WIN64_SPILL_XMM 16
    movu             m2, [tlq+ 2]
    movu             m6, [tlq+34]
    movu            m10, [tlq+66]
    movu            m13, [tlq+98]
    psubw            m4, m2, m3
    psubw            m8, m6, m3
    psubw           m11, m10, m3
    psubw           m14, m13, m3
    pabsw            m5, m4
    pabsw            m9, m8
    pabsw           m12, m11
    pabsw           m15, m14
.w64_loop:
    sub             tlq, 2
    vpbroadcastw     m1, [tlq]
    PAETH 2, 4, 5
    mova    [dstq+32*0], m0
    PAETH 6, 8, 9
    mova    [dstq+32*1], m0
    PAETH 10, 11, 12
    mova    [dstq+32*2], m0
    PAETH 13, 14, 15
    mova    [dstq+32*3], m0
    add            dstq, strideq
    dec              hd
    jg .w64_loop
    RET

; Smooth-vertical prediction: per-row blend of top and bottom pixels,
; weighted by smooth_weights_1d (applied with pmulhrsw as
; bottom + (top-bottom) * w).
cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_16bpc_avx2_table
    lea              r6, [ipred_smooth_v_16bpc_avx2_table]
    tzcnt            wd, wm
    mov              hd, hm
    movsxd           wq, [r6+wq*4]
    lea        weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg              hq                       ; hq counts up toward 0
    vpbroadcastw     m5, [tlq+hq*2] ; bottom
    add              wq, r6
    jmp              wq
.w4:
    vpbroadcastq     m4, [tlq+2] ; top
    movsldup         m3, [base+ipred_hv_shuf]
    lea              r6, [strideq*3]
    psubw            m4, m5 ; top - bottom
.w4_loop:
    vpbroadcastq     m0, [weightsq+hq*2]
    pshufb           m0, m3
    pmulhrsw         m0, m4
    paddw            m0, m5
    vextracti128    xm1, m0, 1
    movhps [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movq   [dstq+r6       ], xm0
    lea            dstq, [dstq+strideq*4]
    add              hq, 4
    jl .w4_loop
.ret:
    RET
.w8:
    vbroadcasti128   m4, [tlq+2]
    movsldup         m3, [base+ipred_hv_shuf]
    lea              r6, [strideq*3]
    psubw            m4, m5
.w8_loop:
    vpbroadcastd     m0, [weightsq+hq*2+0]
    vpbroadcastd     m1, [weightsq+hq*2+4]
    pshufb           m0, m3
    pshufb           m1, m3
    pmulhrsw         m0, m4
    pmulhrsw         m1, m4
    paddw            m0, m5
    paddw            m1, m5
    vextracti128 [dstq+strideq*0], m0, 1
    mova         [dstq+strideq*1], xm0
    vextracti128 [dstq+strideq*2], m1, 1
    mova         [dstq+r6       ], xm1
    lea            dstq, [dstq+strideq*4]
    add              hq, 4
    jl .w8_loop
    RET
.w16:
    movu             m4, [tlq+2]
    lea              r6, [strideq*3]
    psubw            m4, m5
.w16_loop:
    vpbroadcastw     m0, [weightsq+hq*2+0]
    vpbroadcastw     m1, [weightsq+hq*2+2]
    vpbroadcastw     m2, [weightsq+hq*2+4]
    vpbroadcastw     m3, [weightsq+hq*2+6]
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX {paddw    x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r6       ], m3
    lea            dstq, [dstq+strideq*4]
    add              hq, 4
    jl .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM 7
    movu             m4, [tlq+ 2]
    movu             m6, [tlq+34]
    psubw            m4, m5
    psubw            m6, m5
.w32_loop:
    vpbroadcastw     m1, [weightsq+hq*2+0]
    vpbroadcastw     m3, [weightsq+hq*2+2]
    pmulhrsw         m0, m4, m1
    pmulhrsw         m1, m6
    pmulhrsw         m2, m4, m3
    pmulhrsw         m3, m6
    REPX {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea            dstq, [dstq+strideq*2]
    add              hq, 2
    jl .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM 8
    movu             m3, [tlq+ 2]
    movu             m4, [tlq+34]
    movu             m6, [tlq+66]
    movu             m7, [tlq+98]
    REPX {psubw x, m5}, m3, m4, m6, m7
.w64_loop:
    vpbroadcastw     m2, [weightsq+hq*2]
    pmulhrsw         m0, m3, m2
    pmulhrsw         m1, m4, m2
    paddw            m0, m5
    paddw            m1, m5
    mova    [dstq+32*0], m0
    pmulhrsw         m0, m6, m2
    mova    [dstq+32*1], m1
    pmulhrsw         m1, m7, m2
    paddw            m0, m5
    paddw            m1, m5
    mova    [dstq+32*2], m0
    mova    [dstq+32*3], m1
    add            dstq, strideq
    inc              hq
    jl .w64_loop
    RET

; Smooth-horizontal prediction: per-column blend of left and right
; pixels, weighted by smooth_weights_1d.
cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
%define base r6-ipred_smooth_h_16bpc_avx2_table
    lea              r6, [ipred_smooth_h_16bpc_avx2_table]
    mov              wd, wm
    movifnidn        hd, hm
    vpbroadcastw     m5, [tlq+wq*2] ; right
    tzcnt            wd, wd
    add              hd, hd                   ; h counted in bytes of left edge
    movsxd           wq, [r6+wq*4]
    sub             tlq, hq
    lea        stride3q, [strideq*3]
    add              wq, r6
    jmp              wq
.w4:
    vpbroadcastq     m4, [base+smooth_weights_1d_16bpc+4*2]
    movsldup         m3, [base+ipred_hv_shuf]
.w4_loop:
    vpbroadcastq     m0, [tlq+hq-8] ; left
    pshufb           m0, m3
    psubw            m0, m5 ; left - right
    pmulhrsw         m0, m4
    paddw            m0, m5
    vextracti128    xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea            dstq, [dstq+strideq*4]
    sub              hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti128   m4, [base+smooth_weights_1d_16bpc+8*2]
    movsldup         m3, [base+ipred_hv_shuf]
.w8_loop:
    vpbroadcastd     m0, [tlq+hq-4]
    vpbroadcastd     m1, [tlq+hq-8]
    pshufb           m0, m3
    pshufb           m1, m3
    psubw            m0, m5
    psubw            m1, m5
    pmulhrsw         m0, m4
    pmulhrsw         m1, m4
    paddw            m0, m5
    paddw            m1, m5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    lea            dstq, [dstq+strideq*4]
    sub              hq, 4*2
    jg .w8_loop
    RET
.w16:
    movu             m4, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastq     m3, [tlq+hq-8]
    punpcklwd        m3, m3
    psubw            m3, m5
    pshufd           m0, m3, q3333
    pshufd           m1, m3, q2222
    pshufd           m2, m3, q1111
    pshufd           m3, m3, q0000
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX {paddw    x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+stride3q ], m3
    lea            dstq, [dstq+strideq*4]
    sub              hq, 4*2
    jg .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM 7
    movu             m4, [base+smooth_weights_1d_16bpc+32*2]
    movu             m6, [base+smooth_weights_1d_16bpc+32*3]
.w32_loop:
    vpbroadcastw     m1, [tlq+hq-2]
    vpbroadcastw     m3, [tlq+hq-4]
    psubw            m1, m5
    psubw            m3, m5
    pmulhrsw         m0, m4, m1
    pmulhrsw         m1, m6
    pmulhrsw         m2, m4, m3
    pmulhrsw         m3, m6
    REPX {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea            dstq, [dstq+strideq*2]
    sub              hq, 2*2
    jg .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM 8
    movu             m3, [base+smooth_weights_1d_16bpc+32*4]
    movu             m4, [base+smooth_weights_1d_16bpc+32*5]
    movu             m6, [base+smooth_weights_1d_16bpc+32*6]
    movu             m7, [base+smooth_weights_1d_16bpc+32*7]
.w64_loop:
    vpbroadcastw     m2, [tlq+hq-2]
    psubw            m2, m5
    pmulhrsw         m0, m3, m2
    pmulhrsw         m1, m4, m2
    paddw            m0, m5
    paddw            m1, m5
    mova    [dstq+32*0], m0
    pmulhrsw         m0, m6, m2
    mova    [dstq+32*1], m1
    pmulhrsw         m1, m7, m2
    paddw            m0, m5
    paddw            m1, m5
    mova    [dstq+32*2], m0
    mova    [dstq+32*3], m1
    add            dstq, strideq
    sub              hq, 1*2
    jg .w64_loop
    RET

; Final blend of the 2-D smooth predictor: two dword dot-products,
; >>8, pack, then pavgw with 0 (m5) for the final rounding shift.
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddwd          m0, m%1, m%3
    pmaddwd          m1, m%2, m%4
    paddd            m0, m%5
    paddd            m1, m%6
    psrld            m0, 8
    psrld            m1, 8
    packssdw         m0, m1
    pavgw            m0, m5
%endmacro

; 2-D smooth prediction: combines the vertical (top/bottom) and
; horizontal (left/right) blends with the paired (w, 256-w) weights.
cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_16bpc_avx2_table
    lea              r6, [ipred_smooth_16bpc_avx2_table]
    mov              wd, wm
    vpbroadcastw     m4, [tlq+wq*2] ; right
    tzcnt            wd, wd
    mov              hd, hm
    sub             tlq, hq
    sub             tlq, hq
    movsxd           wq, [r6+wq*4]
    pxor             m5, m5
    add              wq, r6
    lea      v_weightsq, [base+smooth_weights_2d_16bpc+hq*4]
    jmp              wq
.w4:
    WIN64_SPILL_XMM 11
    vpbroadcastw     m0, [tlq] ; bottom
    vpbroadcastq     m6, [tlq+hq*2+2]
    movsldup         m7, [base+ipred_hv_shuf]
    movshdup         m9, [base+ipred_hv_shuf]
    vbroadcasti128  m10, [base+smooth_weights_2d_16bpc+4*4]
    punpcklwd        m6, m0 ; top, bottom
    punpcklqdq       m8, m9, m9
    punpckhqdq       m9, m9
    lea              r3, [strideq*3]
.w4_loop:
    vpbroadcastq     m3, [tlq+hq*2-8]
    vbroadcasti128   m1, [v_weightsq]
    pshufb           m3, m7
    punpcklwd        m2, m3, m4 ; left, right
    punpckhwd        m3, m4
    pmaddwd          m2, m10
    pmaddwd          m3, m10
    pshufb           m0, m1, m8
    pshufb           m1, m9
    SMOOTH_2D_END 0, 1, 6, 6, 2, 3
    vextracti128    xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea            dstq, [dstq+strideq*4]
    add      v_weightsq, 16
    sub              hd, 4
    jg .w4_loop
    RET
.w8:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM 12
    vpbroadcastw     m0, [tlq] ; bottom
    vbroadcasti128   m7, [tlq+hq*2+2]
    movsldup         m8, [base+ipred_hv_shuf]
    movshdup         m9, [base+ipred_hv_shuf]
    vbroadcasti128  m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
    vbroadcasti128  m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
    punpcklwd        m6, m7, m0 ; top, bottom
    punpckhwd        m7, m0
.w8_loop:
    vpbroadcastd     m3, [tlq+hq*2-4]
    vpbroadcastq     m1, [v_weightsq]
    pshufb           m3, m8
    punpcklwd        m2, m3, m4 ; left, right
    punpckhwd        m3, m4
    pmaddwd          m2, m10
    pmaddwd          m3, m11
    pshufb           m1, m9
    SMOOTH_2D_END 1, 1, 6, 7, 2, 3
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea            dstq, [dstq+strideq*2]
    add      v_weightsq, 8
    sub              hd, 2
    jg .w8_loop
    RET
.w16:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM 11
    vpbroadcastw     m0, [tlq] ; bottom
    movu             m7, [tlq+hq*2+2]
    mova            xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
    mova            xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
    vinserti128      m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
    vinserti128      m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
    punpcklwd        m6, m7, m0 ; top, bottom
    punpckhwd        m7, m0
.w16_loop:
    vpbroadcastd     m3, [tlq+hq*2-4]
    vpbroadcastd     m1, [v_weightsq+0]
    punpcklwd        m3, m4 ; left, right
    pshufd           m2, m3, q1111
    pmaddwd         m10, m8, m2
    pmaddwd          m2, m9
    pshufd           m3, m3, q0000
    SMOOTH_2D_END 1, 1, 6, 7, 10, 2
    vpbroadcastd     m1, [v_weightsq+4]
    pmaddwd          m2, m8, m3
    pmaddwd          m3, m9
    mova [dstq+strideq*0], m0
    SMOOTH_2D_END 1, 1, 6, 7, 2, 3
    mova [dstq+strideq*1], m0
    lea            dstq, [dstq+strideq*2]
    add      v_weightsq, 8
    sub              hq, 2
    jg .w16_loop
    RET
.w32:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM 15
    vpbroadcastw     m0, [tlq] ; bottom
    movu             m7, [tlq+hq*2+ 2]
    movu             m9, [tlq+hq*2+34]
    mova           xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
    mova           xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
    vinserti128     m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
    vinserti128     m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
    mova           xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
    mova           xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
    vinserti128     m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
    vinserti128     m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
    punpcklwd        m6, m7, m0
    punpckhwd        m7, m0
    punpcklwd        m8, m9, m0
    punpckhwd        m9, m0
.w32_loop:
    vpbroadcastw     m3, [tlq+hq*2-2]
    vpbroadcastd    m14, [v_weightsq]
    punpcklwd        m3, m4
    pmaddwd          m1, m10, m3
    pmaddwd          m2, m11, m3
    pmaddwd          m0, m6, m14
    paddd            m0, m1
    pmaddwd          m1, m7, m14
    paddd            m1, m2
    pmaddwd          m2, m12, m3
    pmaddwd          m3, m13
    psrld            m0, 8
    psrld            m1, 8
    packssdw         m0, m1
    pavgw            m0, m5
    mova    [dstq+32*0], m0
    SMOOTH_2D_END 14, 14, 8, 9, 2, 3
    mova    [dstq+32*1], m0
    add            dstq, strideq
    add      v_weightsq, 4
    dec              hd
    jg .w32_loop
    RET
.w64:
%assign stack_offset stack_offset - stack_size_padded
    ; Re-prologue with extra registers: processes the block in 32-wide
    ; column strips (outer loop over x, inner loop over y).
    PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
    mov       dst_baseq, dstq
    mov        tl_baseq, tlq
    mov v_weights_baseq, v_weightsq
    xor              xq, xq
.w64_loop_x:
    mov              yq, hq
    lea             tlq, [tl_baseq+hq*2]
    vpbroadcastw     m0, [tl_baseq] ; bottom
    movu             m7, [tlq+xq*2+ 2]
    movu             m9, [tlq+xq*2+34]
    mova           xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
    mova           xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
    vinserti128     m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
    vinserti128     m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
    mova           xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
    mova           xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
    vinserti128     m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
    vinserti128     m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
    punpcklwd        m6, m7, m0
    punpckhwd        m7, m0
    punpcklwd        m8, m9, m0
    punpckhwd        m9, m0
    lea             tlq, [tl_baseq-2]
.w64_loop_y:
    vpbroadcastw     m3, [tlq+yq*2]
    vpbroadcastd     m1, [v_weightsq]
    punpcklwd        m3, m4
    pmaddwd         m14, m10, m3
    pmaddwd         m15, m11, m3
    pmaddwd          m2, m12, m3
    pmaddwd          m3, m13
    pmaddwd          m0, m6, m1
    paddd            m0, m14
    pmaddwd         m14, m7, m1
    paddd           m14, m15
    psrld            m0, 8
    psrld           m14, 8
    packssdw         m0, m14
    pavgw            m0, m5
    mova    [dstq+32*0], m0
    SMOOTH_2D_END 8, 9, 1, 1, 2, 3
    mova    [dstq+32*1], m0
    add            dstq, strideq
    add      v_weightsq, 4
    dec              yq
    jg .w64_loop_y
    lea            dstq, [dst_baseq+32*2]
    add              r6, 16*8                 ; advance weight base to next strip
    mov      v_weightsq, v_weights_baseq
    add              xq, 32
    test             xb, 64                   ; done after the second 32-wide strip
    jz .w64_loop_x
    RET

; Directional Z1 prediction (angles < 90); continues below.
cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
    %assign org_stack_offset stack_offset
    lea              r6, [ipred_z1_16bpc_avx2_table]
    tzcnt            wd, wm
    movifnidn
angled, anglem 1111 movifnidn hd, hm 1112 lea r7, [dr_intra_derivative] 1113 movsxd wq, [r6+wq*4] 1114 add tlq, 2 1115 add wq, r6 1116 mov dxd, angled 1117 and dxd, 0x7e 1118 add angled, 165 ; ~90 1119 movzx dxd, word [r7+dxq] 1120 xor angled, 0x4ff ; d = 90 - angle 1121 vpbroadcastd m5, [pw_62] 1122 jmp wq 1123.w4: 1124 ALLOC_STACK -64, 7 1125 cmp angleb, 40 1126 jae .w4_no_upsample 1127 lea r3d, [angleq-1024] 1128 sar r3d, 7 1129 add r3d, hd 1130 jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) 1131 vpbroadcastw xm3, [tlq+14] 1132 movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 1133 palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 1134 paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 1135 add dxd, dxd 1136 palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 1137 paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d 1138 psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 1139 psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 1140 pxor xm4, xm4 1141 paddw xm2, xm0 1142 vpbroadcastw xm0, r8m ; pixel_max 1143 mova [rsp+32], xm3 1144 movd xm3, dxd 1145 pmaxsw xm2, xm4 1146 mov r3d, dxd 1147 pavgw xm2, xm4 1148 vpbroadcastw m3, xm3 1149 pminsw xm2, xm0 1150 punpcklwd xm0, xm1, xm2 1151 punpckhwd xm1, xm2 1152 lea r5, [strideq*3] 1153 pslldq m2, m3, 8 1154 mova [rsp+ 0], xm0 1155 mova [rsp+16], xm1 1156 paddw m6, m3, m3 1157 paddw m3, m2 1158 vpblendd m4, m6, 0xf0 1159 paddw m6, m6 1160 paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 1161 vbroadcasti128 m4, [z_upsample] 1162.w4_upsample_loop: 1163 lea r2d, [r3+dxq] 1164 shr r3d, 6 ; base0 1165 movu xm1, [rsp+r3*2] 1166 lea r3d, [r2+dxq] 1167 shr r2d, 6 ; base1 1168 movu xm2, [rsp+r2*2] 1169 lea r2d, [r3+dxq] 1170 shr r3d, 6 ; base2 1171 vinserti128 m1, [rsp+r3*2], 1 ; 0 2 1172 lea r3d, [r2+dxq] 1173 shr r2d, 6 ; base3 1174 vinserti128 m2, [rsp+r2*2], 1 ; 1 3 1175 pshufb m1, m4 1176 pshufb m2, m4 1177 punpcklqdq m0, m1, m2 1178 punpckhqdq m1, m2 1179 pand m2, m5, m3 ; frac 1180 psllw m2, 9 ; (a * (64 - frac) + b * frac + 
32) >> 6 1181 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) 1182 pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) 1183 paddw m3, m6 ; xpos += dx 1184 paddw m0, m1 1185 vextracti128 xm1, m0, 1 1186 movq [dstq+strideq*0], xm0 1187 movhps [dstq+strideq*1], xm0 1188 movq [dstq+strideq*2], xm1 1189 movhps [dstq+r5 ], xm1 1190 lea dstq, [dstq+strideq*4] 1191 sub hd, 4 1192 jg .w4_upsample_loop 1193 RET 1194ALIGN function_align 1195.filter_strength: ; w4/w8/w16 1196%define base r3-z_filter_t0 1197 movd xm0, maxbased 1198 lea r3, [z_filter_t0] 1199 movd xm1, angled 1200 shr angled, 8 ; is_sm << 1 1201 vpbroadcastb m0, xm0 1202 vpbroadcastb m1, xm1 1203 pcmpeqb m0, [base+z_filter_wh] 1204 mova xm2, [r3+angleq*8] 1205 pand m0, m1 1206 pcmpgtb m0, m2 1207 pmovmskb r5d, m0 1208 ret 1209.w4_no_upsample: 1210 mov maxbased, 7 1211 test angled, 0x400 ; !enable_intra_edge_filter 1212 jnz .w4_main 1213 lea maxbased, [hq+3] 1214 call .filter_strength 1215 mov maxbased, 7 1216 test r5d, r5d 1217 jz .w4_main ; filter_strength == 0 1218 popcnt r5d, r5d 1219 vpbroadcastw xm3, [tlq+14] 1220 mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 1221 vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] 1222 vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] 1223 palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 1224 pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 1225 paddw xm2, xm0 1226 pmullw xm2, xm4 1227 movd [rsp+16], xm3 1228 cmp r5d, 3 1229 jne .w4_3tap 1230 paddw xm1, xm2 1231 palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 1232 pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 1233 movzx r3d, word [tlq+14] 1234 movzx r2d, word [tlq+12] 1235 inc maxbased 1236 paddw xm2, xm0 1237 sub r2d, r3d 1238 paddw xm2, xm2 1239 lea r2d, [r2+r3*8+4] 1240 shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 1241 mov [rsp+16], r2w 1242.w4_3tap: 1243 pxor xm0, xm0 1244 paddw xm1, xm2 1245 mov tlq, rsp 1246 psrlw xm1, 3 1247 cmp hd, 8 1248 sbb maxbased, -1 1249 pavgw xm0, xm1 1250 mova [tlq], xm0 1251.w4_main: 1252 movd xm3, dxd 1253 
vpbroadcastq m1, [z_base_inc] 1254 vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] 1255 shl maxbased, 6 1256 vpbroadcastw m3, xm3 1257 movd xm0, maxbased 1258 mov r3d, dxd ; xpos 1259 vpbroadcastw m0, xm0 1260 paddw m4, m3, m3 1261 psubw m1, m0 ; -max_base_x 1262 vpblendd m3, m4, 0xcc 1263 paddw m0, m4, m3 1264 vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 1265 paddw m4, m4 1266 paddw m3, m1 1267.w4_loop: 1268 lea r5d, [r3+dxq] 1269 shr r3d, 6 ; base0 1270 movu xm1, [tlq+r3*2] 1271 lea r3d, [r5+dxq] 1272 shr r5d, 6 ; base1 1273 movu xm2, [tlq+r5*2] 1274 lea r5d, [r3+dxq] 1275 shr r3d, 6 ; base2 1276 vinserti128 m1, [tlq+r3*2], 1 ; 0 2 1277 lea r3d, [r5+dxq] 1278 shr r5d, 6 ; base3 1279 vinserti128 m2, [tlq+r5*2], 1 ; 1 3 1280 punpcklqdq m0, m1, m2 1281 psrldq m1, 2 1282 pslldq m2, 6 1283 vpblendd m1, m2, 0xcc 1284 pand m2, m5, m3 1285 psllw m2, 9 1286 psubw m1, m0 1287 pmulhrsw m1, m2 1288 psraw m2, m3, 15 ; xpos < max_base_x 1289 paddw m3, m4 1290 paddw m0, m1 1291 vpblendvb m0, m6, m0, m2 1292 vextracti128 xm1, m0, 1 1293 movq [dstq+strideq*0], xm0 1294 movhps [dstq+strideq*1], xm0 1295 lea dstq, [dstq+strideq*2] 1296 movq [dstq+strideq*0], xm1 1297 movhps [dstq+strideq*1], xm1 1298 sub hd, 4 1299 jz .w4_end 1300 lea dstq, [dstq+strideq*2] 1301 cmp r3d, maxbased 1302 jb .w4_loop 1303 lea r6, [strideq*3] 1304.w4_end_loop: 1305 movq [dstq+strideq*0], xm6 1306 movq [dstq+strideq*1], xm6 1307 movq [dstq+strideq*2], xm6 1308 movq [dstq+r6 ], xm6 1309 lea dstq, [dstq+strideq*4] 1310 sub hd, 4 1311 jg .w4_end_loop 1312.w4_end: 1313 RET 1314.w8: 1315 %assign stack_offset org_stack_offset 1316 ALLOC_STACK -64, 7 1317 lea r3d, [angleq+216] 1318 mov r3b, hb 1319 cmp r3d, 8 1320 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 1321 movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ 1322 movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ 1323 movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1324 cmp hd, 4 1325 jne .w8_upsample_h8 ; awkward 
single-pixel edge case 1326 vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ 1327.w8_upsample_h8: 1328 paddw m2, m1 1329 paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1330 add dxd, dxd 1331 psubw m0, m2, m0 1332 psraw m0, 3 1333 pxor m4, m4 1334 paddw m2, m0 1335 vpbroadcastw m0, r8m 1336 movd xm3, dxd 1337 pmaxsw m2, m4 1338 mov r3d, dxd 1339 pavgw m2, m4 1340 vpbroadcastw m3, xm3 1341 pminsw m2, m0 1342 punpcklwd m0, m1, m2 1343 punpckhwd m1, m2 1344 vbroadcasti128 m4, [z_upsample] 1345 mova [rsp+ 0], xm0 1346 mova [rsp+16], xm1 1347 paddw m6, m3, m3 1348 vextracti128 [rsp+32], m0, 1 1349 vextracti128 [rsp+48], m1, 1 1350 vpblendd m3, m6, 0xf0 ; xpos0 xpos1 1351.w8_upsample_loop: 1352 lea r2d, [r3+dxq] 1353 shr r3d, 6 ; base0 1354 movu xm1, [rsp+r3*2] 1355 movu xm2, [rsp+r3*2+16] 1356 lea r3d, [r2+dxq] 1357 shr r2d, 6 ; base1 1358 vinserti128 m1, [rsp+r2*2], 1 1359 vinserti128 m2, [rsp+r2*2+16], 1 1360 pshufb m1, m4 1361 pshufb m2, m4 1362 punpcklqdq m0, m1, m2 1363 punpckhqdq m1, m2 1364 pand m2, m5, m3 1365 psllw m2, 9 1366 psubw m1, m0 1367 pmulhrsw m1, m2 1368 paddw m3, m6 1369 paddw m0, m1 1370 mova [dstq+strideq*0], xm0 1371 vextracti128 [dstq+strideq*1], m0, 1 1372 lea dstq, [dstq+strideq*2] 1373 sub hd, 2 1374 jg .w8_upsample_loop 1375 RET 1376.w8_no_intra_edge_filter: 1377 and maxbased, 7 1378 or maxbased, 8 ; imin(h+7, 15) 1379 jmp .w8_main 1380.w8_no_upsample: 1381 lea maxbased, [hq+7] 1382 test angled, 0x400 1383 jnz .w8_no_intra_edge_filter 1384 call .filter_strength 1385 test r5d, r5d 1386 jz .w8_main 1387 popcnt r5d, r5d 1388 vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] 1389 vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 1390 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1391 movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1392 pmullw m1, m2 1393 cmp hd, 8 1394 jl .w8_filter_h4 1395 punpckhwd m2, m2 1396 vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 1397 je .w8_filter_end ; 8x4 and 8x8 are always 3-tap 1398 
movzx r3d, word [tlq+30] 1399 mov maxbased, 16 1400 mov [rsp+32], r3d 1401 cmp r5d, 3 1402 jne .w8_filter_end 1403 punpcklwd xm6, xm0, xm0 1404 vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g 1405 vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1406 movzx r5d, word [tlq+28] 1407 mov [rsp+34], r3w 1408 paddw m2, m6 1409 sub r5d, r3d 1410 inc maxbased 1411 paddw m2, m2 1412 lea r3d, [r5+r3*8+4] 1413 paddw m1, m2 1414 shr r3d, 3 1415 mov [rsp+32], r3w 1416 jmp .w8_filter_end 1417.w8_filter_h4: 1418 pshuflw m3, m2, q3321 1419 vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ 1420.w8_filter_end: 1421 paddw m0, m3 1422 pmullw m0, m4 1423 mov tlq, rsp 1424 pxor m2, m2 1425 paddw m0, m1 1426 psrlw m0, 3 1427 pavgw m0, m2 1428 mova [tlq], m0 1429.w8_main: 1430 movd xm3, dxd 1431 vbroadcasti128 m1, [z_base_inc] 1432 vpbroadcastw m6, [tlq+maxbaseq*2] 1433 shl maxbased, 6 1434 vpbroadcastw m3, xm3 1435 movd xm0, maxbased 1436 mov r3d, dxd 1437 vpbroadcastw m0, xm0 1438 paddw m4, m3, m3 1439 psubw m1, m0 1440 vpblendd m3, m4, 0xf0 ; xpos0 xpos1 1441 paddw m3, m1 1442.w8_loop: 1443 lea r5d, [r3+dxq] 1444 shr r3d, 6 1445 movu xm0, [tlq+r3*2] 1446 movu xm1, [tlq+r3*2+2] 1447 lea r3d, [r5+dxq] 1448 shr r5d, 6 1449 vinserti128 m0, [tlq+r5*2], 1 1450 vinserti128 m1, [tlq+r5*2+2], 1 1451 pand m2, m5, m3 1452 psllw m2, 9 1453 psubw m1, m0 1454 pmulhrsw m1, m2 1455 psraw m2, m3, 15 1456 paddw m3, m4 1457 paddw m0, m1 1458 vpblendvb m0, m6, m0, m2 1459 mova [dstq+strideq*0], xm0 1460 vextracti128 [dstq+strideq*1], m0, 1 1461 sub hd, 2 1462 jz .w8_end 1463 lea dstq, [dstq+strideq*2] 1464 cmp r3d, maxbased 1465 jb .w8_loop 1466.w8_end_loop: 1467 mova [dstq+strideq*0], xm6 1468 mova [dstq+strideq*1], xm6 1469 lea dstq, [dstq+strideq*2] 1470 sub hd, 2 1471 jg .w8_end_loop 1472.w8_end: 1473 RET 1474.w16_no_intra_edge_filter: 1475 and maxbased, 15 1476 or maxbased, 16 ; imin(h+15, 31) 1477 jmp .w16_main 1478.w16: 1479 %assign stack_offset org_stack_offset 
1480 ALLOC_STACK -96, 7 1481 lea maxbased, [hq+15] 1482 test angled, 0x400 1483 jnz .w16_no_intra_edge_filter 1484 call .filter_strength 1485 test r5d, r5d 1486 jz .w16_main 1487 popcnt r5d, r5d 1488 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1489 paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1490 cmp r5d, 3 1491 jne .w16_filter_3tap 1492 vpbroadcastd m2, [base+pw_3] 1493 punpcklwd xm0, xm0 1494 vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1495 paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1496 paddw m0, m2 1497 pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1498 paddw m0, m1 1499 psrlw m0, 2 1500 movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1501 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1502 paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1503 cmp hd, 8 1504 jl .w16_filter_5tap_h4 1505 punpckhwd m3, m3 1506 je .w16_filter_5tap_h8 1507 vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1508 vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1509 movzx r3d, word [tlq+62] 1510 movzx r2d, word [tlq+60] 1511 pavgw m2, m4 1512 sub r2d, r3d 1513 paddw m1, m3 1514 lea r2d, [r2+r3*8+4] 1515 paddw m1, m2 1516 shr r2d, 3 1517 psrlw m1, 2 1518 mov [rsp+66], r3w 1519 mov [rsp+64], r2w 1520 mov tlq, rsp 1521 mov r3d, 33 1522 cmp hd, 16 1523 cmovg maxbased, r3d 1524 jmp .w16_filter_end2 1525.w16_filter_5tap_h8: 1526 vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 1527 vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 1528 pavgw xm2, xm4 1529 paddw xm1, xm3 1530 paddw xm1, xm2 1531 psrlw xm1, 2 1532 jmp .w16_filter_end2 1533.w16_filter_5tap_h4: 1534 pshuflw xm4, xm3, q3332 ; 4 5 5 5 1535 pshuflw xm3, xm3, q3321 ; 3 4 5 5 1536 pavgw xm2, xm4 1537 paddw xm1, xm3 1538 paddw xm1, xm2 1539 psrlw xm1, 2 1540 jmp .w16_filter_end2 1541.w16_filter_3tap: 1542 vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] 1543 vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 1544 pmullw m0, m3, [tlq+0] ; 1 2 3 
4 5 6 7 8 9 a b c d e f g 1545 movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1546 pmullw m1, m4 1547 pmullw m3, m2 1548 paddw m0, m1 1549 cmp hd, 8 1550 je .w16_filter_3tap_h8 1551 jl .w16_filter_3tap_h4 1552 punpckhwd m2, m2 1553 vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 1554 jmp .w16_filter_end 1555.w16_filter_3tap_h4: 1556 pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ 1557 jmp .w16_filter_end 1558.w16_filter_3tap_h8: 1559 psrldq xm2, 2 1560 pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 1561.w16_filter_end: 1562 paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1563 pmullw m2, m4 1564 psrlw m0, 3 1565 pxor m1, m1 1566 paddw m2, m3 1567 psrlw m2, 3 1568 pavgw m0, m1 1569 pavgw m1, m2 1570.w16_filter_end2: 1571 mov tlq, rsp 1572 mova [tlq+ 0], m0 1573 mova [tlq+32], m1 1574.w16_main: 1575 movd xm4, dxd 1576 vpbroadcastw m6, [tlq+maxbaseq*2] 1577 shl maxbased, 6 1578 vpbroadcastw m4, xm4 1579 movd xm0, maxbased 1580 mov r3d, dxd 1581 vpbroadcastw m0, xm0 1582 paddw m3, m4, [z_base_inc] 1583 psubw m3, m0 1584.w16_loop: 1585 lea r5d, [r3+dxq] 1586 shr r3d, 6 1587 movu m0, [tlq+r3*2] 1588 movu m1, [tlq+r3*2+2] 1589 lea r3d, [r5+dxq] 1590 shr r5d, 6 1591 pand m2, m5, m3 1592 psllw m2, 9 1593 psubw m1, m0 1594 pmulhrsw m1, m2 1595 psraw m2, m3, 15 1596 paddw m3, m4 1597 paddw m1, m0 1598 movu m0, [tlq+r5*2] 1599 vpblendvb m2, m6, m1, m2 1600 movu m1, [tlq+r5*2+2] 1601 mova [dstq+strideq*0], m2 1602 pand m2, m5, m3 1603 psllw m2, 9 1604 psubw m1, m0 1605 pmulhrsw m1, m2 1606 psraw m2, m3, 15 1607 paddw m3, m4 1608 paddw m0, m1 1609 vpblendvb m0, m6, m0, m2 1610 mova [dstq+strideq*1], m0 1611 sub hd, 2 1612 jz .w16_end 1613 lea dstq, [dstq+strideq*2] 1614 cmp r3d, maxbased 1615 jb .w16_loop 1616.w16_end_loop: 1617 mova [dstq+strideq*0], m6 1618 mova [dstq+strideq*1], m6 1619 lea dstq, [dstq+strideq*2] 1620 sub hd, 2 1621 jg .w16_end_loop 1622.w16_end: 1623 RET 1624.w32: 1625 %assign stack_offset org_stack_offset 1626 ALLOC_STACK -160, 8 1627 lea 
maxbased, [hq+31] 1628 mov r3d, 63 1629 cmp hd, 32 1630 cmova maxbased, r3d 1631 test angled, 0x400 1632 jnz .w32_main 1633 vpbroadcastd m2, [pw_3] 1634 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1635 punpcklwd xm1, xm0, xm0 1636 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1637 paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1638 paddw m1, m2 1639 paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1640 pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1641 mov r3, rsp 1642 paddw m0, m1 1643 lea r5d, [maxbaseq-31] 1644 psrlw m0, 2 1645 mova [r3], m0 1646.w32_filter_loop: 1647 mova m0, [tlq+30] 1648 paddw m1, m2, [tlq+28] 1649 add tlq, 32 1650 paddw m0, [tlq+0] 1651 pavgw m1, [tlq+4] 1652 paddw m0, [tlq+2] 1653 add r3, 32 1654 paddw m0, m1 1655 psrlw m0, 2 1656 mova [r3], m0 1657 sub r5d, 16 1658 jg .w32_filter_loop 1659 movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1660 punpckhwd m1, m0, m0 1661 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1662 paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1663 jl .w32_filter_h8 1664 vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 1665 vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1666 movzx r5d, word [tlq+62] 1667 movzx r2d, word [tlq+60] 1668 pavgw m2, m3 1669 sub r2d, r5d 1670 paddw m0, m1 1671 lea r2d, [r2+r5*8+4] 1672 paddw m0, m2 1673 shr r2d, 3 1674 psrlw m0, 2 1675 mova [r3+32], m0 1676 mov [r3+66], r5w 1677 mov [r3+64], r2w 1678 mov tlq, rsp 1679 mov r3d, 65 1680 cmp hd, 64 1681 cmove maxbased, r3d 1682 jmp .w32_main 1683.w32_filter_h8: 1684 vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 1685 vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 1686 pavgw xm2, xm3 1687 paddw xm0, xm1 1688 mov tlq, rsp 1689 paddw xm0, xm2 1690 psrlw xm0, 2 1691 mova [r3+32], xm0 1692.w32_main: 1693 movd xm4, dxd 1694 vpbroadcastw m6, [tlq+maxbaseq*2] 1695 shl maxbased, 6 1696 vpbroadcastw m4, xm4 1697 movd xm0, maxbased 1698 mov r5d, dxd 1699 vpbroadcastd 
m7, [pw_m1024] ; -16 * 64 1700 vpbroadcastw m0, xm0 1701 paddw m3, m4, [z_base_inc] 1702 psubw m3, m0 1703.w32_loop: 1704 mov r3d, r5d 1705 shr r3d, 6 1706 movu m0, [tlq+r3*2] 1707 movu m1, [tlq+r3*2+2] 1708 pand m2, m5, m3 1709 psllw m2, 9 1710 psubw m1, m0 1711 pmulhrsw m1, m2 1712 paddw m0, m1 1713 psraw m1, m3, 15 1714 vpblendvb m0, m6, m0, m1 1715 mova [dstq+32*0], m0 1716 movu m0, [tlq+r3*2+32] 1717 movu m1, [tlq+r3*2+34] 1718 add r5d, dxd 1719 psubw m1, m0 1720 pmulhrsw m1, m2 1721 pcmpgtw m2, m7, m3 1722 paddw m3, m4 1723 paddw m0, m1 1724 vpblendvb m0, m6, m0, m2 1725 mova [dstq+32*1], m0 1726 dec hd 1727 jz .w32_end 1728 add dstq, strideq 1729 cmp r5d, maxbased 1730 jb .w32_loop 1731.w32_end_loop: 1732 mova [dstq+32*0], m6 1733 mova [dstq+32*1], m6 1734 add dstq, strideq 1735 dec hd 1736 jg .w32_end_loop 1737.w32_end: 1738 RET 1739.w64: 1740 %assign stack_offset org_stack_offset 1741 ALLOC_STACK -256, 10 1742 lea maxbased, [hq+63] 1743 test angled, 0x400 1744 jnz .w64_main 1745 vpbroadcastd m2, [pw_3] 1746 mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1747 punpcklwd xm1, xm0, xm0 1748 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 1749 paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1750 paddw m1, m2 1751 paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1752 pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i 1753 mov r3, rsp 1754 paddw m0, m1 1755 lea r5d, [hq+32] 1756 psrlw m0, 2 1757 mova [r3], m0 1758.w64_filter_loop: 1759 mova m0, [tlq+30] 1760 paddw m1, m2, [tlq+28] 1761 add tlq, 32 1762 paddw m0, [tlq+0] 1763 pavgw m1, [tlq+4] 1764 paddw m0, [tlq+2] 1765 add r3, 32 1766 paddw m0, m1 1767 psrlw m0, 2 1768 mova [r3], m0 1769 sub r5d, 16 1770 jg .w64_filter_loop 1771 movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 1772 punpckhwd m1, m0, m0 1773 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 1774 paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 1775 vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h 
h 1776 vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 1777 pavgw m2, m3 1778 paddw m0, m1 1779 paddw m0, m2 1780 mov tlq, rsp 1781 psrlw m0, 2 1782 mova [r3+32], m0 1783.w64_main: 1784 movd xm4, dxd 1785 vpbroadcastw m6, [tlq+maxbaseq*2] 1786 shl maxbased, 6 1787 vpbroadcastw m4, xm4 1788 movd xm0, maxbased 1789 mov r5d, dxd 1790 vpbroadcastd m7, [pw_m1024] ; -16 * 64 1791 vpbroadcastw m0, xm0 1792 paddw m3, m4, [z_base_inc] 1793 paddw m8, m7, m7 ; -32 * 64 1794 psubw m3, m0 1795 paddw m9, m8, m7 ; -48 * 64 1796.w64_loop: 1797 mov r3d, r5d 1798 shr r3d, 6 1799 movu m0, [tlq+r3*2] 1800 movu m1, [tlq+r3*2+2] 1801 pand m2, m5, m3 1802 psllw m2, 9 1803 psubw m1, m0 1804 pmulhrsw m1, m2 1805 paddw m0, m1 1806 psraw m1, m3, 15 1807 vpblendvb m0, m6, m0, m1 1808 mova [dstq+32*0], m0 1809 movu m0, [tlq+r3*2+32] 1810 movu m1, [tlq+r3*2+34] 1811 psubw m1, m0 1812 pmulhrsw m1, m2 1813 paddw m0, m1 1814 pcmpgtw m1, m7, m3 1815 vpblendvb m0, m6, m0, m1 1816 mova [dstq+32*1], m0 1817 movu m0, [tlq+r3*2+64] 1818 movu m1, [tlq+r3*2+66] 1819 psubw m1, m0 1820 pmulhrsw m1, m2 1821 paddw m0, m1 1822 pcmpgtw m1, m8, m3 1823 vpblendvb m0, m6, m0, m1 1824 mova [dstq+32*2], m0 1825 movu m0, [tlq+r3*2+96] 1826 movu m1, [tlq+r3*2+98] 1827 add r5d, dxd 1828 psubw m1, m0 1829 pmulhrsw m1, m2 1830 pcmpgtw m2, m9, m3 1831 paddw m3, m4 1832 paddw m0, m1 1833 vpblendvb m0, m6, m0, m2 1834 mova [dstq+32*3], m0 1835 dec hd 1836 jz .w64_end 1837 add dstq, strideq 1838 cmp r5d, maxbased 1839 jb .w64_loop 1840.w64_end_loop: 1841 mova [dstq+32*0], m6 1842 mova [dstq+32*1], m6 1843 mova [dstq+32*2], m6 1844 mova [dstq+32*3], m6 1845 add dstq, strideq 1846 dec hd 1847 jg .w64_end_loop 1848.w64_end: 1849 RET 1850 1851cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy 1852%define base r9-z_filter_t0 1853 lea r9, [ipred_z2_16bpc_avx2_table] 1854 tzcnt wd, wm 1855 movifnidn angled, anglem 1856 movifnidn hd, hm 1857 lea dxq, [dr_intra_derivative-90] 1858 movsxd wq, 
[r9+wq*4] 1859 mova m1, [tlq- 0] 1860 movzx dyd, angleb 1861 xor angled, 0x400 1862 mova m2, [tlq- 32] 1863 mov r8, dxq 1864 sub dxq, dyq 1865 mova m3, [tlq- 64] 1866 add wq, r9 1867 add r9, z_filter_t0-ipred_z2_16bpc_avx2_table 1868 mova m4, [tlq- 96] 1869 and dyd, ~1 1870 mova m5, [tlq-128] 1871 and dxq, ~1 1872 movzx dyd, word [r8+dyq] ; angle - 90 1873 movzx dxd, word [dxq+270] ; 180 - angle 1874 vpbroadcastd m11, [base+pw_62] 1875 mova [rsp+128], m1 1876 mova [rsp+ 96], m2 1877 mova [rsp+ 64], m3 1878 neg dxd 1879 mova [rsp+ 32], m4 1880 neg dyq 1881 mova [rsp+ 0], m5 1882 jmp wq 1883.w4: 1884 vbroadcasti128 m10, [base+z2_x_shuf] 1885 vpbroadcastq m6, [base+z_base_inc+2] 1886 lea r8d, [dxq+(65<<6)] ; xpos 1887 mov r10d, (63-4)<<6 1888 test angled, 0x400 1889 jnz .w4_main ; !enable_intra_edge_filter 1890 lea r3d, [hq+2] 1891 add angled, 1022 1892 shl r3d, 6 1893 test r3d, angled 1894 jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 1895 movq xm0, [tlq+2] ; 1 2 3 4 1896 movq xm1, [tlq+0] ; 0 1 2 3 1897 pshuflw xm2, xm0, q3321 ; 2 3 4 4 1898 pshuflw xm3, xm1, q2100 ; 0 0 1 2 1899 vpbroadcastw xm4, r8m ; pixel_max 1900 vbroadcasti128 m10, [base+z_upsample] 1901 paddw xm1, xm0 1902 paddw xm2, xm3 1903 lea r8d, [r8+dxq+(1<<6)] 1904 psubw xm2, xm1, xm2 1905 add dxd, dxd 1906 psraw xm2, 3 1907 pxor xm3, xm3 1908 sub r10d, 3<<6 1909 paddw xm1, xm2 1910 paddw m6, m6 1911 pmaxsw xm1, xm3 1912 sub angled, 1075 ; angle - 53 1913 pavgw xm1, xm3 1914 lea r3d, [hq+3] 1915 pminsw xm1, xm4 1916 xor angled, 0x7f ; 180 - angle 1917 punpcklwd xm1, xm0 1918 movu [rsp+130], xm1 1919 call .filter_strength 1920 jmp .w4_filter_left 1921ALIGN function_align 1922.filter_strength: 1923 movd xm8, r3d 1924 mov r3d, angled 1925 movd xm7, angled 1926 vpbroadcastb m8, xm8 1927 shr r3d, 8 ; is_sm << 1 1928 vpbroadcastb m7, xm7 1929 pcmpeqb m8, [base+z_filter_wh] 1930 mova xm9, [r9+r3*8] 1931 pand m0, m8, m7 1932 pcmpgtb m0, m9 1933 pmovmskb r3d, m0 1934 ret 1935ALIGN 
function_align 1936.upsample_left: ; h4/h8 1937 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 1938 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 1939%if STACK_ALIGNMENT < 32 1940 vpbroadcastw xm4, r8m ; pixel_max 1941%else 1942 vpbroadcastw xm4, r9m ; r8m -> r9m due to call 1943%endif 1944 cmp hd, 8 1945 je .upsample_left_h8 1946 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 1947 pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 1948 jmp .upsample_left_end 1949.upsample_left_h8: 1950 pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 1951 pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 1952.upsample_left_end: 1953 paddw xm1, xm0 1954 paddw xm2, xm3 1955 psubw xm2, xm1, xm2 1956 add dyq, dyq 1957 psraw xm2, 3 1958 pxor xm3, xm3 1959 paddw xm1, xm2 1960 pmaxsw xm1, xm3 1961 pavgw xm1, xm3 1962 pminsw xm1, xm4 1963 punpcklwd xm2, xm0, xm1 1964 punpckhwd xm0, xm1 1965 mova [rsp+ 96+gprsize], xm2 1966 mova [rsp+112+gprsize], xm0 1967 ret 1968.w4_no_upsample_above: 1969 lea r3d, [hq+3] 1970 sub angled, 1112 ; angle - 90 1971 call .filter_strength 1972 test r3d, r3d 1973 jz .w4_no_filter_above 1974 popcnt r3d, r3d 1975 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 1976 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] 1977 psrldq xm0, xm1, 2 ; 1 2 3 4 1978 pshuflw xm2, xm1, q2100 ; 0 0 1 2 1979 pmullw xm4, xm0 1980 pshuflw xm3, xm0, q3321 ; 2 3 4 4 1981 paddw xm1, xm3 1982 pshuflw xm3, xm0, q3332 ; 3 4 4 4 1983 pmullw xm1, xm5 1984 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] 1985 paddw xm2, xm3 1986 vpbroadcastd xm3, r6m ; max_width 1987 pmullw xm2, xm5 1988 packssdw xm3, xm3 1989 paddw xm1, xm4 1990 paddw xm1, xm2 1991 psubw xm3, [base+pw_1to16] 1992 pxor xm4, xm4 1993 psrlw xm1, 3 1994 pminsw xm3, xm11 ; clip to byte range since there's no variable word blend 1995 pavgw xm1, xm4 1996 vpblendvb xm1, xm0, xm3 1997 movq [rsp+130], xm1 1998.w4_no_filter_above: 1999 lea r3d, [hq+2] 2000 add angled, 973 ; angle + 883 2001 shl r3d, 6 2002 test r3d, angled 2003 jz .w4_upsample_left ; angle <= 140 || 
h > 8 || (is_sm && h == 8) 2004 vpbroadcastd xm0, [base+pb_90] 2005 psubb xm0, xm7 ; 180 - angle 2006 pand xm0, xm8 ; reuse from previous filter_strength call 2007 pcmpgtb xm0, xm9 2008 pmovmskb r3d, xm0 2009.w4_filter_left: 2010 test r3d, r3d 2011 jz .w4_main 2012 popcnt r3d, r3d 2013 mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2014 vpbroadcastd m5, r7m ; max_height 2015 cmp r3d, 3 2016 je .w4_filter_left_s3 2017 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] 2018 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2019 pmullw m2, m0 2020 cmp hd, 8 2021 jl .w4_filter_left_h4 2022 movu m4, [tlq-34] 2023 punpcklwd m1, m0, m0 2024 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2025 je .w4_filter_left_end 2026 vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2027 jmp .w4_filter_left_end 2028.w4_upsample_left: 2029 call .upsample_left 2030 mov r11, -16 2031 vbroadcasti128 m9, [base+z_upsample] 2032 jmp .w4_main_upsample_left 2033.w4_filter_left_s3: ; can only be h16 2034 movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2035 vpbroadcastd m4, [base+pw_3] 2036 paddw m1, m0, m2 2037 punpckhwd m2, m2 2038 vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 2039 punpcklwd xm3, xm0, xm0 2040 paddw m2, m4 2041 vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2042 vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d 2043 paddw m1, m4 2044 pavgw m2, m3 2045 paddw m1, m2 2046 psrlw m1, 2 2047 jmp .w4_filter_left_end2 2048.w4_filter_left_h4: 2049 pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e 2050.w4_filter_left_end: 2051 paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2052 pmullw m1, m3 2053 paddw m1, m2 2054 pxor m2, m2 2055 psrlw m1, 3 2056 pavgw m1, m2 2057.w4_filter_left_end2: 2058 packssdw m5, m5 2059 psubw m5, [base+pw_16to1] 2060 pminsw m5, m11 2061 vpblendvb m1, m0, m5 2062 mova [rsp+96], m1 2063.w4_main: 2064 vbroadcasti128 m9, [base+z2_x_shuf] 2065 mov r11, -8 2066.w4_main_upsample_left: 2067 movd 
xm5, dyd 2068 mova m4, [base+z2_y_shuf_h4] 2069 mov r2d, r8d 2070 movd xm0, dxd 2071 vpbroadcastw m5, xm5 2072 rorx r5, dyq, 5 2073 lea r8d, [dyq*3] 2074 pmullw m5, [base+z2_ymul] 2075 rorx r9, dyq, 4 2076 sar dyd, 6 2077 vpbroadcastw m0, xm0 2078 sar r8d, 6 2079 pand m5, m11 ; frac_y 2080 neg dyd 2081 psllw m5, 9 2082 add r5d, dyd 2083 add r8d, dyd 2084 add r9d, dyd 2085 paddw m7, m0, m0 2086 lea dyq, [rsp+dyq*2+126] 2087 vpblendd m0, m7, 0xcc 2088 add dyq, r11 2089 neg r5d 2090 paddw m1, m0, m7 2091 neg r8d 2092 vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 2093 neg r9d 2094 paddw m7, m7 2095 paddw m6, m0 2096.w4_loop: 2097 lea r3d, [r2+dxq] 2098 shr r2d, 6 ; base_x0 2099 movu xm1, [rsp+r2*2] 2100 lea r2d, [r3+dxq] 2101 shr r3d, 6 ; base_x1 2102 movu xm3, [rsp+r3*2] 2103 lea r3d, [r2+dxq] 2104 shr r2d, 6 ; base_x2 2105 vinserti128 m1, [rsp+r2*2], 1 2106 lea r2d, [r3+dxq] 2107 shr r3d, 6 ; base_x3 2108 vinserti128 m3, [rsp+r3*2], 1 2109 pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 2110 pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 2111 pand m2, m11, m6 2112 punpcklqdq m0, m1, m3 2113 punpckhqdq m1, m3 2114 psllw m2, 9 2115 psubw m1, m0 2116 pmulhrsw m1, m2 2117 paddw m0, m1 2118 cmp r3d, 64 2119 jge .w4_toponly 2120 movu xm2, [dyq] 2121 vinserti128 m2, [dyq+r8*2], 1 2122 movu xm3, [dyq+r5*2] 2123 vinserti128 m3, [dyq+r9*2], 1 2124 pshufb m2, m9 2125 pshufb m3, m9 2126 punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 2127 punpcklwd m2, m3 2128 psubw m2, m1 2129 pmulhrsw m2, m5 2130 psraw m3, m6, 15 ; base_x < topleft 2131 paddw m1, m2 2132 vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 2133 vpblendvb m0, m1, m3 2134.w4_toponly: 2135 paddw m6, m7 ; xpos += dx 2136 lea r3, [strideq*3] 2137 add dyq, r11 2138 vextracti128 xm1, m0, 1 2139 movq [dstq+strideq*0], xm0 2140 movhps [dstq+strideq*1], xm0 2141 movq [dstq+strideq*2], xm1 2142 movhps [dstq+r3 ], xm1 2143 sub hd, 4 2144 jz .w4_end 2145 lea dstq, [dstq+strideq*4] 2146 cmp r2d, r10d 2147 jge .w4_loop 
2148.w4_leftonly_loop: 2149 movu xm1, [dyq] 2150 vinserti128 m1, [dyq+r8*2], 1 2151 movu xm2, [dyq+r5*2] 2152 vinserti128 m2, [dyq+r9*2], 1 2153 add dyq, r11 2154 pshufb m1, m9 2155 pshufb m2, m9 2156 punpckhwd m0, m1, m2 2157 punpcklwd m1, m2 2158 psubw m1, m0 2159 pmulhrsw m1, m5 2160 paddw m0, m1 2161 vpermd m0, m4, m0 2162 vextracti128 xm1, m0, 1 2163 movq [dstq+strideq*0], xm0 2164 movhps [dstq+strideq*1], xm0 2165 movq [dstq+strideq*2], xm1 2166 movhps [dstq+r3 ], xm1 2167 lea dstq, [dstq+strideq*4] 2168 sub hd, 4 2169 jg .w4_leftonly_loop 2170.w4_end: 2171 RET 2172.w8: 2173 mov r10d, hd 2174 test angled, 0x400 2175 jnz .w8_main 2176 lea r3d, [angleq+126] 2177 xor r8d, r8d 2178 mov r3b, hb 2179 cmp r3d, 8 2180 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2181 movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 2182 mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 2183 pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 2184 pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 2185 vpbroadcastw xm4, r8m ; pixel_max 2186 paddw xm1, xm0 2187 paddw xm2, xm3 2188 not r8d 2189 psubw xm2, xm1, xm2 2190 add dxd, dxd 2191 psraw xm2, 3 2192 sub angled, 53 ; angle - 53 2193 pxor xm3, xm3 2194 paddw xm2, xm1 2195 lea r3d, [hq+7] 2196 pmaxsw xm2, xm3 2197 xor angled, 0x7f ; 180 - angle 2198 pavgw xm2, xm3 2199 pminsw xm2, xm4 2200 punpcklwd xm1, xm2, xm0 2201 punpckhwd xm2, xm0 2202 movu [rsp+130], xm1 2203 movu [rsp+146], xm2 2204 call .filter_strength 2205 jmp .w8_filter_left 2206.w8_no_upsample_above: 2207 lea r3d, [hq+7] 2208 sub angled, 90 ; angle - 90 2209 call .filter_strength 2210 test r3d, r3d 2211 jz .w8_no_filter_above 2212 popcnt r3d, r3d 2213 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 2214 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] 2215 vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] 2216 movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x 2217 pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x 2218 pmullw xm4, xm0 2219 pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x 2220 paddw 
xm1, xm3 2221 vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x 2222 paddw xm2, xm3 2223 vpbroadcastd xm3, r6m ; max_width 2224 pmullw xm1, xm5 2225 pmullw xm2, xm6 2226 packssdw xm3, xm3 2227 paddw xm1, xm4 2228 paddw xm1, xm2 2229 psubw xm3, [base+pw_1to16] 2230 pxor xm4, xm4 2231 psrlw xm1, 3 2232 pminsw xm3, xm11 2233 pavgw xm1, xm4 2234 vpblendvb xm1, xm0, xm3 2235 movu [rsp+130], xm1 2236.w8_no_filter_above: 2237 lea r3d, [angleq-51] 2238 mov r3b, hb 2239 cmp r3d, 8 2240 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm 2241 vpbroadcastd m0, [base+pb_90] 2242 psubb m0, m7 2243 pand m0, m8 2244 pcmpgtb m0, m9 2245 pmovmskb r3d, m0 2246.w8_filter_left: 2247 test r3d, r3d 2248 jz .w8_main 2249 popcnt r3d, r3d 2250 cmp r3d, 3 2251 jne .w8_filter_left_s12 2252 vpbroadcastd m6, [base+pw_3] 2253 vpbroadcastd m7, [base+pw_16] 2254 cmp hd, 16 ; flags needed for later 2255 jmp .filter_left_s3b 2256.w8_upsample_left: 2257 call .upsample_left 2258 vbroadcasti128 m7, [base+z2_y_shuf_us] 2259 lea r11, [rsp+118] 2260 mov r8, -8 2261 jmp .w8_main_upsample_left 2262.w16_filter_left_s12: 2263 xor r8d, r8d 2264.w8_filter_left_s12: 2265 mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2266 vpbroadcastd m5, r7m ; max_height 2267 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] 2268 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2269 pmullw m2, m0 2270 cmp hd, 8 2271 jl .w8_filter_left_h4 2272 movu m4, [tlq-34] 2273 punpcklwd m1, m0, m0 2274 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e 2275 je .w8_filter_left_end 2276 vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2277 jmp .w8_filter_left_end 2278.w8_filter_left_h4: 2279 pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e 2280.w8_filter_left_end: 2281 paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2282 pmullw m1, m3 2283 paddw m1, m2 2284 pxor m2, m2 2285 psrlw m1, 3 2286 pavgw m1, m2 2287 packssdw m5, m5 2288 psubw m5, [base+pw_16to1] 2289 pminsw m5, m11 2290 vpblendvb m1, m0, m5 2291 mova 
[rsp+96], m1 2292 test r8d, r8d 2293 jz .w8_main 2294; upsample_main 2295 vbroadcasti128 m10, [base+z_upsample] 2296 vbroadcasti128 m7, [base+z2_y_shuf] 2297 lea r5, [rsp+120] 2298 movd xm1, dyd 2299 vbroadcasti128 m4, [base+z_base_inc+2] 2300 movd xm2, dxd 2301 vpbroadcastw m1, xm1 2302 vpbroadcastw m2, xm2 2303 mov r7, dstq 2304 paddw m4, m4 2305 pmullw m0, m1, [base+z2_ymul8] 2306 paddw m5, m2, m2 2307 psllw xm1, 3 2308 vpblendd m2, m5, 0xf0 2309 lea r2d, [dxq+(66<<6)] ; xpos 2310 paddw m4, m2 2311 pshufd m6, m0, q2020 2312 psraw xm0, 6 2313 pxor xm1, xm1 2314 psubw xm8, xm1, xm0 2315 pand m6, m11 2316 punpckhwd xm9, xm8, xm1 2317 psllw m6, 9 2318 punpcklwd xm8, xm1 2319.w8_upsample_above_loop: 2320 lea r3d, [r2+dxq] 2321 shr r2d, 6 2322 movu xm1, [rsp+r2*2] 2323 movu xm2, [rsp+r2*2+16] 2324 lea r2d, [r3+dxq] 2325 shr r3d, 6 2326 vinserti128 m1, [rsp+r3*2], 1 2327 vinserti128 m2, [rsp+r3*2+16], 1 2328 pshufb m1, m10 2329 pshufb m2, m10 2330 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 2331 punpckhqdq m1, m2 2332 pand m2, m11, m4 2333 psubw m1, m0 2334 psllw m2, 9 2335 pmulhrsw m1, m2 2336 paddw m0, m1 2337 cmp r3d, 64 2338 jge .w8_upsample_above_toponly 2339 mova m1, m5 2340 vpgatherdq m3, [r5+xm9*2], m5 2341 mova m5, m1 2342 vpgatherdq m2, [r5+xm8*2], m1 2343 pshufb m3, m7 2344 pshufb m2, m7 2345 punpckldq m1, m2, m3 2346 punpckhdq m2, m3 2347 psubw m2, m1 2348 pmulhrsw m2, m6 2349 paddw m1, m2 2350 vpermq m1, m1, q3120 2351 psraw m2, m4, 15 2352 vpblendvb m0, m1, m2 2353.w8_upsample_above_toponly: 2354 paddw m4, m5 2355 sub r5, 4 2356 mova [dstq+strideq*0], xm0 2357 vextracti128 [dstq+strideq*1], m0, 1 2358 sub hd, 2 2359 jz .w8_ret 2360 lea dstq, [dstq+strideq*2] 2361 jmp .w8_upsample_above_loop 2362.w8_main: 2363 vbroadcasti128 m7, [base+z2_y_shuf] 2364 lea r11, [rsp+120] 2365 mov r8, -4 2366.w8_main_upsample_left: 2367 movd xm1, dyd 2368 vbroadcasti128 m4, [base+z_base_inc+2] 2369 movd xm2, dxd 2370 vpbroadcastw m1, xm1 2371 vpbroadcastw m2, xm2 2372 mov 
r7, dstq 2373 pmullw m0, m1, [base+z2_ymul8] 2374 paddw m5, m2, m2 2375 psllw xm1, 3 2376 vpblendd m2, m5, 0xf0 ; xpos0 xpos1 2377 lea r9d, [dxq+(65<<6)] ; xpos 2378 paddw m4, m2 2379 movd [rsp+284], xm1 2380.w8_loop0: 2381 mov r2d, r9d 2382 mova [rsp+288], m0 2383 mov r5, r11 2384 mova [rsp+320], m4 2385 pshufd m6, m0, q2020 2386 psraw xm0, 6 2387 pxor xm1, xm1 2388 psubw xm8, xm1, xm0 ; base_y 2389 pand m6, m11 ; frac_y 2390 punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 2391 psllw m6, 9 2392 punpcklwd xm8, xm1 ; base_y 0 1 4 5 2393.w8_loop: 2394 lea r3d, [r2+dxq] 2395 shr r2d, 6 ; base_x0 2396 movu xm0, [rsp+r2*2] 2397 movu xm1, [rsp+r2*2+2] 2398 lea r2d, [r3+dxq] 2399 shr r3d, 6 ; base_x1 2400 vinserti128 m0, [rsp+r3*2], 1 2401 vinserti128 m1, [rsp+r3*2+2], 1 2402 pand m2, m11, m4 2403 psubw m1, m0 2404 psllw m2, 9 2405 pmulhrsw m1, m2 2406 paddw m0, m1 2407 cmp r3d, 64 2408 jge .w8_toponly 2409 mova m1, m5 2410 vpgatherdq m3, [r5+xm9*2], m5 2411 mova m5, m1 2412 vpgatherdq m2, [r5+xm8*2], m1 2413 pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 2414 pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 2415 punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 2416 punpckhdq m2, m3 2417 psubw m2, m1 2418 pmulhrsw m2, m6 2419 paddw m1, m2 2420 vpermq m1, m1, q3120 2421 psraw m2, m4, 15 ; base_x < topleft 2422 vpblendvb m0, m1, m2 2423.w8_toponly: 2424 paddw m4, m5 ; xpos += dx 2425 add r5, r8 2426 mova [dstq+strideq*0], xm0 2427 vextracti128 [dstq+strideq*1], m0, 1 2428 sub hd, 2 2429 jz .w8_end 2430 lea dstq, [dstq+strideq*2] 2431 cmp r2d, (63-8)<<6 2432 jge .w8_loop 2433.w8_leftonly_loop: 2434 mova m0, m5 2435 vpgatherdq m4, [r5+xm9*2], m5 2436 mova m5, m0 2437 vpgatherdq m3, [r5+xm8*2], m0 2438 add r5, r8 2439 pshufb m2, m4, m7 2440 pshufb m1, m3, m7 2441 punpckldq m0, m1, m2 2442 punpckhdq m1, m2 2443 psubw m1, m0 2444 pmulhrsw m1, m6 2445 paddw m0, m1 2446 vpermq m0, m0, q3120 2447 mova [dstq+strideq*0], xm0 2448 vextracti128 [dstq+strideq*1], m0, 1 2449 lea dstq, 
[dstq+strideq*2] 2450 sub hd, 2 2451 jg .w8_leftonly_loop 2452.w8_end: 2453 sub r10d, 1<<8 2454 jl .w8_ret 2455 vpbroadcastd m0, [rsp+284] 2456 add r7, 16 2457 paddw m0, [rsp+288] ; base_y += 8*dy 2458 add r9d, 8<<6 2459 vpbroadcastd m4, [pw_512] 2460 movzx hd, r10b 2461 paddw m4, [rsp+320] ; base_x += 8*64 2462 mov dstq, r7 2463 jmp .w8_loop0 2464.w8_ret: 2465 RET 2466.w16: 2467 movd xm0, [tlq+32] 2468 lea r10d, [hq+(1<<8)] 2469 movd [rsp+160], xm0 2470 test angled, 0x400 2471 jnz .w8_main 2472 lea r3d, [hq+15] 2473 sub angled, 90 2474 call .filter_strength 2475 test r3d, r3d 2476 jz .w16_no_filter_above 2477 popcnt r3d, r3d 2478 vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] 2479 vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] 2480 vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] 2481 movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2482 punpcklwd xm2, xm1, xm1 2483 vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2484 punpckhwd m3, m0, m0 2485 pmullw m4, m0 2486 vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g 2487 paddw m1, m3 2488 vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g 2489 paddw m2, m3 2490 vpbroadcastd m3, r6m ; max_width 2491 pmullw m1, m5 2492 pmullw m2, m6 2493 packssdw m3, m3 2494 paddw m1, m4 2495 paddw m1, m2 2496 psubw m3, [base+pw_1to16] 2497 pxor m4, m4 2498 psrlw m1, 3 2499 pminsw m3, m11 2500 pavgw m1, m4 2501 vpblendvb m1, m0, m3 2502 movu [rsp+130], m1 2503.w16_no_filter_above: 2504 vpbroadcastd m0, [base+pb_90] 2505 psubb m0, m7 2506 pand m0, m8 2507 pcmpgtb m0, m9 2508 pmovmskb r3d, m0 2509 test r3d, r3d 2510 jz .w8_main 2511 popcnt r3d, r3d 2512 cmp r3d, 3 2513 jne .w16_filter_left_s12 2514 vpbroadcastd m6, [base+pw_3] 2515 vpbroadcastd m7, [base+pw_16] 2516 cmp hd, 4 2517 jne .filter_left_s3 2518 movq xm0, [tlq-8] ; 0 1 2 3 2519 movq xm1, [tlq-6] ; 1 2 3 4 2520 vpbroadcastd xm5, r7m ; max_height 2521 movq xm4, [base+pw_16to1+24] ; 4to1 2522 pshuflw xm2, xm0, q2100 ; 0 0 1 2 2523 pshuflw xm3, xm1, 
q3321 ; 2 3 4 4 2524 paddw xm1, xm0 2525 paddw xm1, xm2 2526 pshuflw xm2, xm0, q1000 ; 0 0 0 1 2527 paddw xm3, xm6 2528 packssdw xm5, xm5 2529 pavgw xm2, xm3 2530 psubw xm5, xm4 2531 paddw xm1, xm2 2532 pminsw xm5, xm11 2533 psrlw xm1, 2 2534 vpblendvb xm1, xm0, xm5 2535 movq [rsp+120], xm1 2536 jmp .w8_main 2537.w32: 2538 mova m2, [tlq+32] 2539 movd xm0, [tlq+64] 2540 lea r10d, [hq+(3<<8)] 2541 mova [rsp+160], m2 2542 movd [rsp+192], xm0 2543 test angled, 0x400 2544 jnz .w8_main 2545 vpbroadcastd m6, [base+pw_3] 2546 vpbroadcastd m0, r6m ; max_width 2547 vpbroadcastd m7, [base+pw_16] 2548 mov r3d, 32 2549 packssdw m0, m0 2550 psubw m0, [base+pw_1to16] 2551 pminsw m8, m0, m11 2552 psubw m9, m8, m7 2553.w32_filter_above: 2554 movu m0, [tlq+2] 2555 punpcklwd xm4, xm1, xm1 2556 paddw m2, m6, [tlq+6] 2557 paddw m1, m0 2558 vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 2559 paddw m1, [tlq+4] 2560 movu m3, [tlq+r3+2] 2561 paddw m5, m6, [tlq+r3-2] 2562 pavgw m2, m4 2563 punpckhwd m4, m3, m3 2564 paddw m1, m2 2565 vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h 2566 vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h 2567 pavgw m2, m5 2568 paddw m5, m3, [tlq+r3] 2569 paddw m4, m5 2570 psrlw m1, 2 2571 paddw m2, m4 2572 vpblendvb m1, m0, m8 2573 psrlw m2, 2 2574 vpblendvb m2, m3, m9 2575 movu [rsp+130], m1 2576 movu [rsp+r3+130], m2 2577.filter_left_s3: 2578 cmp hd, 16 2579 jl .filter_left_s3_h8 ; h8 2580.filter_left_s3b: 2581 mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2582 movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i 2583 vpbroadcastd m5, r7m ; max_height 2584 paddw m1, m0, m2 2585 punpckhwd m2, m2 2586 mov r3d, hd 2587 vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 2588 packssdw m5, m5 2589 not r3 2590 psubw m5, [base+pw_16to1] 2591 paddw m2, m6 2592 pminsw m8, m11, m5 2593 je .filter_left_s3_end ; h16 2594 paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2595 pavgw m2, [tlq-36] ; 0 1 2 3 4 
5 6 7 8 9 a b c d e f 2596 paddw m1, m2 2597 psrlw m1, 2 2598 vpblendvb m3, m1, m0, m8 2599 mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2600 paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i 2601 paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j 2602 psubw m8, m7 2603 mova [rsp+96], m3 2604 jnp .filter_left_s3_end ; h32 2605 mova m5, [tlq-96] 2606 paddw m1, [tlq-66] 2607 pavgw m2, [tlq-68] 2608 paddw m1, m2 2609 paddw m4, m5, [tlq-94] 2610 paddw m2, m6, [tlq-92] 2611 psrlw m1, 2 2612 paddw m4, [tlq- 98] 2613 pavgw m2, [tlq-100] 2614 vpblendvb m3, m1, m0, m8 2615 mova m0, [tlq-128] 2616 psubw m8, m7 2617 paddw m4, m2 2618 paddw m1, m0, [tlq-126] 2619 paddw m2, m6, [tlq-124] 2620 psrlw m4, 2 2621 mova [rsp+64], m3 2622 vpblendvb m4, m5, m8 2623 psubw m8, m7 2624 mova [rsp+32], m4 2625.filter_left_s3_end: 2626 punpcklwd xm3, xm0, xm0 2627 vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g 2628 vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f 2629 paddw m1, m4 2630 pavgw m2, m3 2631 paddw m1, m2 2632 psrlw m1, 2 2633 vpblendvb m1, m0, m8 2634 mova [rsp+r3*2+130], m1 2635 jmp .w8_main 2636.filter_left_s3_h8: 2637 mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 2638 movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 2639 pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 2640 vpbroadcastd xm5, r7m ; max_height 2641 paddw xm1, xm0, xm3 2642 pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 2643 paddw xm1, xm2 2644 vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 2645 paddw xm3, xm6 2646 packssdw xm5, xm5 2647 pavgw xm2, xm3 2648 psubw xm5, [base+pw_16to1+16] ; 8to1 2649 paddw xm1, xm2 2650 pminsw xm5, xm11 2651 psrlw xm1, 2 2652 vpblendvb xm1, xm0, xm5 2653 mova [rsp+112], xm1 2654 jmp .w8_main 2655.w64: 2656 mova m2, [tlq+ 32] 2657 mova m3, [tlq+ 64] 2658 mova m4, [tlq+ 96] 2659 movd xm0, [tlq+128] 2660 lea r10d, [hq+(7<<8)] 2661 mova [rsp+160], m2 2662 mova [rsp+192], m3 2663 mova [rsp+224], m4 2664 movd [rsp+256], xm0 2665 test angled, 0x400 
2666 jnz .w8_main 2667 vpbroadcastd m6, [base+pw_3] 2668 movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h 2669 paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2670 paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2671 pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h 2672 paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h 2673 movu m4, [tlq+66] 2674 paddw m3, m6, [tlq+62] 2675 paddw m7, m4, [tlq+64] 2676 pavgw m3, [tlq+70] 2677 paddw m7, [tlq+68] 2678 paddw m2, m5 2679 vpbroadcastd m5, r6m ; max_width 2680 mov r3d, 96 2681 packssdw m5, m5 2682 paddw m3, m7 2683 psubw m5, [base+pw_1to16] 2684 psrlw m2, 2 2685 vpbroadcastd m7, [base+pw_16] 2686 psrlw m3, 2 2687 pminsw m8, m11, m5 2688 psubw m9, m8, m7 2689 vpblendvb m2, m0, m9 2690 psubw m9, m7 2691 vpblendvb m3, m4, m9 2692 psubw m9, m7 2693 movu [rsp+162], m2 2694 movu [rsp+194], m3 2695 jmp .w32_filter_above 2696 2697cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase 2698 %assign org_stack_offset stack_offset 2699 lea r6, [ipred_z3_16bpc_avx2_table] 2700 tzcnt hd, hm 2701 movifnidn angled, anglem 2702 lea r7, [dr_intra_derivative+45*2-1] 2703 sub tlq, 2 2704 movsxd hq, [r6+hq*4] 2705 sub angled, 180 2706 add hq, r6 2707 mov dyd, angled 2708 neg dyd 2709 xor angled, 0x400 2710 or dyq, ~0x7e 2711 movzx dyd, word [r7+dyq] 2712 vpbroadcastd m5, [pw_62] 2713 mov org_wd, wd 2714 jmp hq 2715.h4: 2716 ALLOC_STACK -64, 7 2717 lea r7, [strideq*3] 2718 cmp angleb, 40 2719 jae .h4_no_upsample 2720 lea r4d, [angleq-1024] 2721 sar r4d, 7 2722 add r4d, wd 2723 jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) 2724 mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 2725 pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 2726 vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 2727 pshufd xm3, xm1, q0000 2728 paddw xm1, xm2 2729 paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 2730 vpbroadcastw xm4, r8m ; pixel_max 2731 add dyd, dyd 2732 psubw xm0, xm1, xm0 2733 mova [rsp+ 
0], xm3 2734 movd xm3, dyd 2735 psraw xm0, 3 2736 neg dyd 2737 paddw xm1, xm0 2738 pxor xm0, xm0 2739 lea r2d, [dyq+(16<<6)+63] ; ypos 2740 pmaxsw xm1, xm0 2741 pavgw xm1, xm0 2742 vpbroadcastw m3, xm3 2743 pminsw xm1, xm4 2744 punpckhwd xm0, xm1, xm2 2745 punpcklwd xm1, xm2 2746 paddw m2, m3, m3 2747 mova [rsp+32], xm0 2748 punpcklwd m3, m2 2749 mova [rsp+16], xm1 2750 paddw m4, m2, m2 2751 paddw m2, m3 2752 vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 2753.h4_upsample_loop: 2754 lea r4d, [r2+dyq] 2755 shr r2d, 6 2756 movu xm1, [rsp+r2*2] 2757 lea r2d, [r4+dyq] 2758 shr r4d, 6 2759 movu xm2, [rsp+r4*2] 2760 lea r4d, [r2+dyq] 2761 shr r2d, 6 2762 vinserti128 m1, [rsp+r2*2], 1 2763 lea r2d, [r4+dyq] 2764 shr r4d, 6 2765 vinserti128 m2, [rsp+r4*2], 1 2766 psrld m0, m1, 16 2767 pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 2768 pslld m2, 16 2769 pblendw m1, m2, 0xaa 2770 pand m2, m5, m3 2771 psllw m2, 9 2772 psubw m1, m0 2773 pmulhrsw m1, m2 2774 paddw m3, m4 2775 paddw m1, m0 2776 vextracti128 xm2, m1, 1 2777 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 2778 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 2779 movhps [dstq+strideq*0], xm0 2780 movq [dstq+strideq*1], xm0 2781 movhps [dstq+strideq*2], xm1 2782 movq [dstq+r7 ], xm1 2783 add dstq, 8 2784 sub wd, 4 2785 jg .h4_upsample_loop 2786 RET 2787ALIGN function_align 2788.filter_strength: ; h4/h8/h16 2789%define base r4-z_filter_t0 2790 lea r4, [z_filter_t0] 2791 movd xm0, maxbased 2792 movd xm1, angled 2793 shr angled, 8 ; is_sm << 1 2794 vpbroadcastb m0, xm0 2795 vpbroadcastb m1, xm1 2796 pcmpeqb m0, [base+z_filter_wh] 2797 pand m0, m1 2798 mova xm1, [r4+angleq*8] 2799 pcmpgtb m0, m1 2800 pmovmskb r5d, m0 2801 ret 2802.h4_no_upsample: 2803 mov maxbased, 7 2804 test angled, 0x400 ; !enable_intra_edge_filter 2805 jnz .h4_main 2806 lea maxbased, [wq+3] 2807 call .filter_strength 2808 mov maxbased, 7 2809 test r5d, r5d 2810 jz .h4_main ; filter_strength == 0 2811 popcnt r5d, r5d 2812 
; --- ipred_z3 16bpc, w=4: intra-edge filter + interpolation main loop -------
; NOTE(review): the bare decimal numbers are source-listing line numbers fused
; into the text by extraction; they are preserved unchanged.
; Filters the 8 left-edge pixels with the z_filter_k kernel selected by the
; filter strength in r5d (3-tap always; extra 5-tap contribution when
; strength == 3), writes the filtered edge to the stack, then interpolates
; along a 6.6 fixed-point y position stepped by dy, 4 columns at a time.
mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7
2813 movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8
2814 vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1]
2815 vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
2816 pmullw xm2, xm0
2817 pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2818 paddw xm1, xm0, xm3
2819 movd [rsp+12], xm0
2820 pmullw xm1, xm4
2821 cmp r5d, 3                  ; strength 3 -> add the wider 5-tap term
2822 jne .h4_filter_3tap
2823 pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
2824 vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2825 movzx r4d, word [tlq-14]
2826 movzx r2d, word [tlq-12]
2827 inc maxbased
2828 paddw xm1, xm2
2829 paddw xm0, xm3
2830 sub r2d, r4d
2831 paddw xm2, xm0, xm0
2832 lea r2d, [r2+r4*8+4]        ; rounded scalar blend of the two edge pixels
2833 shr r2d, 3
2834 mov [rsp+14], r2w
2835
.h4_filter_3tap:                 ; shared tail: finish (sum+4)>>3 rounding
2836 pxor xm0, xm0
2837 paddw xm1, xm2
2838 lea tlq, [rsp+30]            ; edge is consumed from the stack copy below
2839 psrlw xm1, 3
2840 cmp wd, 8
2841 sbb maxbased, -1             ; maxbase += (w >= 8) via borrow trick
2842 pavgw xm0, xm1
2843 mova [rsp+16], xm0
2844
.h4_main:                        ; set up ypos (6.6 fixed point) and clamp data
2845 movd xm3, dyd
2846 neg maxbaseq
2847 vbroadcasti128 m1, [z_base_inc]
2848 vpbroadcastw m6, [tlq+maxbaseq*2] ; replicated last edge pixel for overflow
2849 shl maxbased, 6
2850 vpbroadcastw m3, xm3
2851 lea r4d, [maxbaseq+3*64]
2852 neg dyq
2853 movd xm2, r4d
2854 sub tlq, 8
2855 lea r4, [dyq+63] ; ypos
2856 punpcklwd m1, m1
2857 paddw m0, m3, m3
2858 vpbroadcastw m2, xm2
2859 punpcklwd m3, m0
2860 paddw m4, m0, m0             ; m4 = per-iteration ypos step (4*dy)
2861 paddw m0, m3
2862 psubw m2, m1
2863 vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3
2864 or maxbased, 63
2865 paddw m3, m2
2866
.h4_loop:                        ; 4 columns/iter: gather, lerp, transpose, store
2867 lea r5, [r4+dyq]
2868 sar r4, 6 ; base0
2869 movu xm1, [tlq+r4*2]
2870 lea r4, [r5+dyq]
2871 sar r5, 6 ; base1
2872 movu xm2, [tlq+r5*2]
2873 lea r5, [r4+dyq]
2874 sar r4, 6 ; base2
2875 vinserti128 m1, [tlq+r4*2], 1
2876 lea r4, [r5+dyq]
2877 sar r5, 6 ; base3
2878 vinserti128 m2, [tlq+r5*2], 1
2879 punpckhwd m0, m1, m2
2880 punpcklwd m1, m2
2881 pand m2, m5, m3              ; frac = ypos & 0x3e (pw_62 mask in m5)
2882 palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
2883 psllw m2, 9                  ; scale frac for pmulhrsw rounding multiply
2884 psubw m1, m0
2885 pmulhrsw m1, m2
2886 psraw m2, m3, 15 ; ypos < max_base_y
2887 paddw m3, m4
2888 paddw m1, m0
2889
vpblendvb m1, m6, m1, m2 2890 vextracti128 xm2, m1, 1 2891 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 2892 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 2893 movhps [dstq+strideq*0], xm0 2894 movq [dstq+strideq*1], xm0 2895 movhps [dstq+strideq*2], xm1 2896 movq [dstq+r7 ], xm1 2897 sub wd, 4 2898 jz .h4_end 2899 add dstq, 8 2900 cmp r4d, maxbased 2901 jg .h4_loop 2902.h4_end_loop: 2903 movq [dstq+strideq*0], xm6 2904 movq [dstq+strideq*1], xm6 2905 movq [dstq+strideq*2], xm6 2906 movq [dstq+r7 ], xm6 2907 add dstq, 8 2908 sub wd, 4 2909 jg .h4_end_loop 2910.h4_end: 2911 RET 2912.h8: 2913 lea r4d, [angleq+216] 2914 %assign stack_offset org_stack_offset 2915 ALLOC_STACK -64, 8 2916 mov r4b, wb 2917 lea r7, [strideq*3] 2918 cmp r4d, 8 2919 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 2920 mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 2921 paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e 2922 movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d 2923 cmp wd, 8 2924 je .h8_upsample_w8 2925 pshufhw xm3, xm2, q1000 2926 vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d 2927.h8_upsample_w8: 2928 paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 2929 vpbroadcastw m4, r8m ; pixel_max 2930 add dyd, dyd 2931 psubw m0, m1, m0 2932 movd xm6, dyd 2933 psraw m0, 3 2934 neg dyd 2935 paddw m1, m0 2936 pxor m0, m0 2937 pmaxsw m1, m0 2938 lea r4d, [dyq+(16<<6)+63] ; ypos 2939 pavgw m1, m0 2940 vpbroadcastw m6, xm6 2941 pminsw m1, m4 2942 punpckhwd m0, m1, m2 2943 punpcklwd m1, m2 2944 vextracti128 [rsp+48], m0, 1 2945 vextracti128 [rsp+32], m1, 1 2946 paddw m7, m6, m6 2947 mova [rsp+16], xm0 2948 mova [rsp+ 0], xm1 2949 punpcklwd m6, m7 ; ypos0 ypos1 2950.h8_upsample_loop: 2951 lea r2d, [r4+dyq] 2952 shr r4d, 6 ; base0 2953 movu m1, [rsp+r4*2] 2954 lea r4d, [r2+dyq] 2955 shr r2d, 6 ; base1 2956 movu m2, [rsp+r2*2] 2957 lea r2d, [r4+dyq] 2958 shr r4d, 6 ; base2 2959 movu m3, [rsp+r4*2] 2960 lea r4d, [r2+dyq] 2961 shr r2d, 6 ; 
base3 2962 movu m4, [rsp+r2*2] 2963 psrld m0, m1, 16 2964 pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 2965 pslld m2, 16 2966 pblendw m1, m2, 0xaa 2967 psrld m2, m3, 16 2968 pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 2969 pslld m4, 16 2970 pblendw m3, m4, 0xaa 2971 pand m4, m5, m6 2972 paddw m6, m7 2973 psllw m4, 9 2974 psubw m1, m0 2975 pmulhrsw m1, m4 2976 pand m4, m5, m6 2977 psllw m4, 9 2978 psubw m3, m2 2979 pmulhrsw m3, m4 2980 paddw m6, m7 2981 lea r2, [dstq+strideq*4] 2982 paddw m1, m0 2983 paddw m3, m2 2984 punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 2985 punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 2986 vextracti128 xm2, m0, 1 2987 vextracti128 xm3, m1, 1 2988 movhps [r2 +strideq*0], xm0 2989 movq [r2 +strideq*1], xm0 2990 movhps [r2 +strideq*2], xm1 2991 movq [r2 +r7 ], xm1 2992 movhps [dstq+strideq*0], xm2 2993 movq [dstq+strideq*1], xm2 2994 movhps [dstq+strideq*2], xm3 2995 movq [dstq+r7 ], xm3 2996 add dstq, 8 2997 sub wd, 4 2998 jg .h8_upsample_loop 2999 RET 3000.h8_no_intra_edge_filter: 3001 and maxbased, 7 3002 or maxbased, 8 ; imin(w+7, 15) 3003 jmp .h8_main 3004.h8_no_upsample: 3005 lea maxbased, [wq+7] 3006 test angled, 0x400 3007 jnz .h8_no_intra_edge_filter 3008 call .filter_strength 3009 test r5d, r5d 3010 jz .h8_main 3011 popcnt r5d, r5d 3012 mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3013 movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3014 vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] 3015 vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] 3016 pmullw m2, m0 3017 cmp wd, 8 3018 jl .h8_filter_w4 3019 punpcklwd xm0, xm0 3020 vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3021 movd [rsp+28], xm0 3022 paddw m1, m3 3023 mov r4d, 16 3024 pmullw m1, m4 3025 cmovg maxbased, r4d 3026 cmp r5d, 3 3027 jne .h8_filter_3tap 3028 punpckhwd m3, m3 3029 vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3030 
; --- ipred_z3 16bpc, w=8: edge-filter tail + main interpolation loop --------
; NOTE(review): bare decimal numbers are source-listing line numbers fused in
; by extraction; preserved unchanged.
; Finishes the strength-3 (5-tap) filter path for w>=8, handles the w=4
; filter variant, then runs the 8-row loop: two 128-bit edge fetches per base,
; 6.6 fixed-point lerp, out-of-range lanes clamped to the replicated last
; edge pixel (m7) via vpblendvb, followed by a 4x8 word transpose.
vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
3031 movzx r4d, word [tlq-30]
3032 movzx r2d, word [tlq-28]
3033 inc maxbased
3034 paddw m1, m2
3035 paddw m0, m3
3036 sub r2d, r4d
3037 paddw m2, m0, m0
3038 lea r2d, [r2+r4*8+4]        ; rounded scalar blend of first two pixels
3039 shr r2d, 3
3040 mov [rsp+30], r2w
3041 jmp .h8_filter_3tap
3042
.h8_filter_w4:                   ; narrow variant: top half replicated via shuffle
3043 pshufhw xm1, xm0, q2100
3044 vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e
3045 paddw m1, m3
3046 pmullw m1, m4
3047
.h8_filter_3tap:                 ; shared (sum+4)>>3 rounding tail
3048 pxor m0, m0
3049 paddw m1, m2
3050 lea tlq, [rsp+62]            ; edge is consumed from the stack copy below
3051 psrlw m1, 3
3052 pavgw m0, m1
3053 mova [rsp+32], m0
3054
.h8_main:                        ; ypos setup (6.6 fixed point), clamp pixel in m7
3055 movd xm4, dyd
3056 neg maxbaseq
3057 vbroadcasti128 m1, [z_base_inc]
3058 vpbroadcastw m7, [tlq+maxbaseq*2]
3059 shl maxbased, 6
3060 vpbroadcastw m4, xm4
3061 lea r4d, [maxbaseq+7*64]
3062 neg dyq
3063 movd xm2, r4d
3064 sub tlq, 16
3065 lea r4, [dyq+63]
3066 paddw m6, m4, m4             ; m6 = per-pair ypos step (2*dy)
3067 vpbroadcastw m2, xm2
3068 vpblendd m4, m6, 0xf0 ; ypos0 ypos1
3069 psubw m2, m1
3070 or maxbased, 63
3071 paddw m4, m2
3072
.h8_loop:                        ; 4 columns/iter, 8 rows each
3073 lea r5, [r4+dyq]
3074 sar r4, 6 ; base0
3075 movu xm0, [tlq+r4*2+2]
3076 movu xm1, [tlq+r4*2]
3077 lea r4, [r5+dyq]
3078 sar r5, 6 ; base1
3079 vinserti128 m0, [tlq+r5*2+2], 1
3080 vinserti128 m1, [tlq+r5*2], 1
3081 lea r5, [r4+dyq]
3082 sar r4, 6 ; base2
3083 pand m3, m5, m4              ; frac = ypos & 0x3e
3084 psllw m3, 9                  ; scale for pmulhrsw
3085 psubw m1, m0
3086 pmulhrsw m1, m3
3087 psraw m3, m4, 15             ; sign mask: ypos still < max_base_y
3088 paddw m4, m6
3089 paddw m0, m1
3090 movu xm1, [tlq+r4*2+2]
3091 movu xm2, [tlq+r4*2]
3092 lea r4, [r5+dyq]
3093 sar r5, 6 ; base3
3094 vpblendvb m0, m7, m0, m3     ; clamp out-of-range lanes to last edge pixel
3095 vinserti128 m1, [tlq+r5*2+2], 1
3096 vinserti128 m2, [tlq+r5*2], 1
3097 pand m3, m5, m4
3098 psllw m3, 9
3099 psubw m2, m1
3100 pmulhrsw m2, m3
3101 psraw m3, m4, 15
3102 paddw m4, m6
3103 lea r5, [dstq+strideq*4]
3104 paddw m1, m2
3105 vpblendvb m1, m7, m1, m3
3106 punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0
3107 vextracti128 xm3, m2, 1
3108 punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4
3109 punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 3110 punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 3111 vextracti128 xm3, m0, 1 3112 movhps [dstq+strideq*0], xm1 3113 movq [dstq+strideq*1], xm1 3114 movhps [dstq+strideq*2], xm2 3115 movq [dstq+r7 ], xm2 3116 punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 3117 punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 3118 movhps [r5 +strideq*0], xm1 3119 movq [r5 +strideq*1], xm1 3120 movhps [r5 +strideq*2], xm0 3121 movq [r5 +r7 ], xm0 3122 sub wd, 4 3123 jz .h8_end 3124 add dstq, 8 3125 cmp r4d, maxbased 3126 jg .h8_loop 3127 lea r6, [strideq*5] 3128 lea r2, [strideq+r7*2] ; stride*7 3129 test wd, 4 3130 jz .h8_end_loop 3131 movq [dstq+strideq*0], xm7 3132 movq [dstq+strideq*1], xm7 3133 movq [dstq+strideq*2], xm7 3134 movq [dstq+r7 ], xm7 3135 movq [dstq+strideq*4], xm7 3136 movq [dstq+r6 ], xm7 3137 movq [dstq+r7*2 ], xm7 3138 movq [dstq+r2 ], xm7 3139 add dstq, 8 3140 sub wd, 4 3141 jz .h8_end 3142.h8_end_loop: 3143 mova [dstq+strideq*0], xm7 3144 mova [dstq+strideq*1], xm7 3145 mova [dstq+strideq*2], xm7 3146 mova [dstq+r7 ], xm7 3147 mova [dstq+strideq*4], xm7 3148 mova [dstq+r6 ], xm7 3149 mova [dstq+r7*2 ], xm7 3150 mova [dstq+r2 ], xm7 3151 add dstq, 16 3152 sub wd, 8 3153 jg .h8_end_loop 3154.h8_end: 3155 RET 3156.h16_no_intra_edge_filter: 3157 and maxbased, 15 3158 or maxbased, 16 ; imin(w+15, 31) 3159 jmp .h16_main 3160ALIGN function_align 3161.h16: 3162 %assign stack_offset org_stack_offset 3163 ALLOC_STACK -96, 10 3164 lea maxbased, [wq+15] 3165 lea r7, [strideq*3] 3166 test angled, 0x400 3167 jnz .h16_no_intra_edge_filter 3168 call .filter_strength 3169 test r5d, r5d 3170 jz .h16_main ; filter_strength == 0 3171 popcnt r5d, r5d 3172 movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3173 paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3174 vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] 3175 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] 3176 pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f 
g h 3177 pmullw m1, m7 3178 paddw m1, m2 3179 cmp wd, 8 3180 jg .h16_filter_w16 3181 mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 3182 pmullw xm6, xm3 3183 jl .h16_filter_w4 3184 pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 3185 cmp r5d, 3 3186 jne .h16_filter_w8_3tap 3187 vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 3188.h16_filter_w8_5tap: 3189 punpckhwd m0, m0 3190 vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3191 paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 3192 paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3193 paddw xm4, xm4 3194 paddw m0, m0 3195 paddw xm6, xm4 3196 paddw m1, m0 3197.h16_filter_w8_3tap: 3198 paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 3199 pmullw xm3, xm7 3200 pxor m0, m0 3201 paddw xm3, xm6 3202 psrlw xm3, 3 3203 pavgw xm3, xm0 3204 mova [rsp+48], xm3 3205 jmp .h16_filter_end 3206.h16_filter_w4: 3207 pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 3208 cmp r5d, 3 3209 jne .h16_filter_w8_3tap 3210 pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 3211 jmp .h16_filter_w8_5tap 3212.h16_filter_w16: 3213 mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3214 pmullw m6, m3 3215 punpcklwd xm3, xm3 3216 vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3217 paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3218 mov r4d, 32 3219 cmp wd, 16 3220 cmovg maxbased, r4d 3221 movd [rsp+28], xm3 3222 pmullw m4, m7 3223 cmp r5d, 3 3224 jne .h16_filter_w16_3tap 3225 punpckhwd m0, m0 3226 vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3227 vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3228 paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3229 paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3230 movzx r4d, word [tlq-62] 3231 movzx r2d, word [tlq-60] 3232 or maxbased, 1 3233 paddw m3, m3 3234 sub r2d, r4d 3235 paddw m0, m0 3236 lea r2d, [r2+r4*8+4] 3237 paddw m4, m3 3238 shr r2d, 3 3239 paddw m1, m0 3240 mov [rsp+30], r2w 3241.h16_filter_w16_3tap: 3242 pxor m0, m0 3243 paddw m4, m6 3244 psrlw m4, 3 3245 
pavgw m4, m0 3246 mova [rsp+32], m4 3247.h16_filter_end: 3248 psrlw m1, 3 3249 lea tlq, [rsp+94] 3250 pavgw m1, m0 3251 mova [rsp+64], m1 3252.h16_main: 3253 movd xm8, dyd 3254 neg maxbaseq 3255 vpbroadcastw m9, [tlq+maxbaseq*2] 3256 shl maxbased, 6 3257 vpbroadcastw m8, xm8 3258 lea r4d, [maxbaseq+dyq+15*64] 3259 neg dyq 3260 movd xm7, r4d 3261 sub tlq, 32 3262 lea r4, [dyq+63] 3263 vpbroadcastw m7, xm7 3264 or maxbased, 63 3265 psubw m7, [z_base_inc] 3266.h16_loop: 3267 lea r5, [r4+dyq] 3268 sar r4, 6 ; base0 3269 movu m0, [tlq+r4*2+2] 3270 movu m2, [tlq+r4*2] 3271 lea r4, [r5+dyq] 3272 sar r5, 6 ; base1 3273 movu m1, [tlq+r5*2+2] 3274 movu m3, [tlq+r5*2] 3275 lea r5, [r4+dyq] 3276 sar r4, 6 ; base3 3277 pand m6, m5, m7 3278 psllw m6, 9 3279 psubw m2, m0 3280 pmulhrsw m2, m6 3281 psraw m6, m7, 15 3282 paddw m7, m8 3283 paddw m0, m2 3284 movu m2, [tlq+r4*2+2] 3285 movu m4, [tlq+r4*2] 3286 lea r4, [r5+dyq] 3287 sar r5, 6 ; base3 3288 vpblendvb m0, m9, m0, m6 3289 pand m6, m5, m7 3290 psllw m6, 9 3291 psubw m3, m1 3292 pmulhrsw m3, m6 3293 psraw m6, m7, 15 3294 paddw m7, m8 3295 paddw m1, m3 3296 vpblendvb m1, m9, m1, m6 3297 pand m6, m5, m7 3298 psllw m6, 9 3299 psubw m4, m2 3300 pmulhrsw m4, m6 3301 psraw m6, m7, 15 3302 paddw m7, m8 3303 paddw m2, m4 3304 movu m3, [tlq+r5*2+2] 3305 movu m4, [tlq+r5*2] 3306 vpblendvb m2, m9, m2, m6 3307 pand m6, m5, m7 3308 psllw m6, 9 3309 psubw m4, m3 3310 pmulhrsw m4, m6 3311 psraw m6, m7, 15 3312 paddw m7, m8 3313 lea r5, [dstq+strideq*4] 3314 paddw m3, m4 3315 vpblendvb m3, m9, m3, m6 3316 punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 3317 punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 3318 punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 3319 punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 3320 punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 3321 vextracti128 xm6, m3, 1 3322 punpckldq m4, m1 ; ab bb cb db aa ba ca da 
a3 b3 c3 d3 a2 b2 c2 d2 3323 punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 3324 punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 3325 vextracti128 xm2, m4, 1 3326 movhps [dstq+strideq*0], xm6 3327 movq [dstq+strideq*1], xm6 3328 vextracti128 xm6, m1, 1 3329 movhps [dstq+strideq*2], xm2 3330 movq [dstq+r7 ], xm2 3331 vextracti128 xm2, m0, 1 3332 movhps [r5 +strideq*0], xm6 3333 movq [r5 +strideq*1], xm6 3334 movhps [r5 +strideq*2], xm2 3335 movq [r5 +r7 ], xm2 3336 lea r5, [dstq+strideq*8] 3337 movhps [r5 +strideq*0], xm3 3338 movq [r5 +strideq*1], xm3 3339 movhps [r5 +strideq*2], xm4 3340 movq [r5 +r7 ], xm4 3341 lea r5, [r5+strideq*4] 3342 movhps [r5 +strideq*0], xm1 3343 movq [r5 +strideq*1], xm1 3344 movhps [r5 +strideq*2], xm0 3345 movq [r5 +r7 ], xm0 3346 sub wd, 4 3347 jz .h16_end 3348 add dstq, 8 3349 cmp r4d, maxbased 3350 jg .h16_loop 3351 mov hd, 4 3352.h16_end_loop0: 3353 mov r6d, wd 3354 mov r2, dstq 3355 test wb, 4 3356 jz .h16_end_loop 3357 movq [dstq+strideq*0], xm9 3358 movq [dstq+strideq*1], xm9 3359 movq [dstq+strideq*2], xm9 3360 movq [dstq+r7 ], xm9 3361 and r6d, 120 3362 jz .h16_end_w4 3363 add dstq, 8 3364.h16_end_loop: 3365 mova [dstq+strideq*0], xm9 3366 mova [dstq+strideq*1], xm9 3367 mova [dstq+strideq*2], xm9 3368 mova [dstq+r7 ], xm9 3369 add dstq, 16 3370 sub r6d, 8 3371 jg .h16_end_loop 3372.h16_end_w4: 3373 lea dstq, [r2+strideq*4] 3374 dec hd 3375 jg .h16_end_loop0 3376.h16_end: 3377 RET 3378.h32: 3379 %assign stack_offset org_stack_offset 3380 ALLOC_STACK -160, 9 3381 lea maxbased, [wq+31] 3382 and maxbased, 31 3383 or maxbased, 32 ; imin(w+31, 63) 3384 test angled, 0x400 3385 jnz .h32_main 3386 vpbroadcastd m2, [pw_3] 3387 movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3388 punpckhwd m1, m0, m0 3389 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3390 paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3391 paddw m1, m2 3392 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c 
d e f g 3393 pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3394 lea r4, [rsp+128] 3395 paddw m0, m1 3396 lea r5d, [maxbaseq-31] 3397 psrlw m0, 2 3398 mova [r4], m0 3399.h32_filter_loop: 3400 mova m0, [tlq-62] 3401 paddw m1, m2, [tlq-66] 3402 paddw m0, [tlq-64] 3403 pavgw m1, [tlq-58] 3404 paddw m0, [tlq-60] 3405 sub tlq, 32 3406 sub r4, 32 3407 paddw m0, m1 3408 psrlw m0, 2 3409 mova [r4], m0 3410 sub r5d, 16 3411 jg .h32_filter_loop 3412 jl .h32_filter_h8 3413 mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3414 punpcklwd xm1, xm0, xm0 3415 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3416 paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3417 vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 3418 vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e 3419 movzx r5d, word [tlq-62] 3420 movzx r2d, word [tlq-60] 3421 pavgw m2, m3 3422 sub r2d, r5d 3423 paddw m0, m1 3424 lea r2d, [r2+r5*8+4] 3425 paddw m0, m2 3426 shr r2d, 3 3427 psrlw m0, 2 3428 mova [r4-32], m0 3429 mov [r4-36], r5w 3430 mov [r4-34], r2w 3431 lea tlq, [rsp+158] 3432 mov r4d, 65 3433 cmp wd, 64 3434 cmove maxbased, r4d 3435 jmp .h32_main 3436.h32_filter_h8: 3437 mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 3438 pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 3439 paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 3440 paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 3441 vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 3442 lea tlq, [rsp+158] 3443 pavgw xm2, xm3 3444 paddw xm0, xm1 3445 paddw xm0, xm2 3446 psrlw xm0, 2 3447 mova [r4-16], xm0 3448.h32_main: 3449 movd xm6, dyd 3450 neg maxbaseq 3451 vpbroadcastw m7, [tlq+maxbaseq*2] 3452 shl maxbased, 6 3453 vpbroadcastw m6, xm6 3454 lea r4d, [maxbaseq+dyq+15*64] 3455 neg dyq 3456 movd xm4, r4d 3457 vpbroadcastd m8, [pw_m1024] 3458 lea r4, [dyq+63] 3459 vpbroadcastw m4, xm4 3460 or maxbased, 63 3461 psubw m4, [z_base_inc] 3462.h32_loop: 3463 mov r5, r4 3464 sar r5, 6 3465 movu m1, [tlq+r5*2-64] 3466 movu m0, [tlq+r5*2-62] 3467 pand 
m3, m5, m4 3468 psllw m3, 9 3469 psubw m1, m0 3470 pmulhrsw m1, m3 3471 pcmpgtw m2, m8, m4 3472 paddw m0, m1 3473 vpblendvb m0, m7, m0, m2 3474 movu m2, [tlq+r5*2-32] 3475 movu m1, [tlq+r5*2-30] 3476 add r4, dyq 3477 sub rsp, 64 3478 psubw m2, m1 3479 pmulhrsw m2, m3 3480 psraw m3, m4, 15 3481 paddw m4, m6 3482 mova [rsp+32*0], m0 3483 paddw m1, m2 3484 vpblendvb m1, m7, m1, m3 3485 mova [rsp+32*1], m1 3486 dec wd 3487 jz .h32_transpose 3488 cmp r4d, maxbased 3489 jg .h32_loop 3490.h32_end_loop: 3491 sub rsp, 64 3492 mova [rsp+32*0], m7 3493 mova [rsp+32*1], m7 3494 dec wd 3495 jg .h32_end_loop 3496.h32_transpose: 3497 lea r3, [strideq*3] 3498 lea r4, [strideq*5] 3499 mov r8, dstq 3500 lea r5, [strideq+r3*2] 3501.h32_transpose_loop0: 3502 lea r6, [rsp+32] 3503 lea r2, [r8+org_wq*2-16] 3504.h32_transpose_loop: 3505 mova m0, [r6+64*7] 3506 mova m1, [r6+64*6] 3507 mova m2, [r6+64*5] 3508 mova m3, [r6+64*4] 3509 mova m4, [r6+64*3] 3510 mova m5, [r6+64*2] 3511 mova m6, [r6+64*1] 3512 mova m7, [r6+64*0] 3513 punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 3514 punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 3515 punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 3516 punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 3517 punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 3518 punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 3519 punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 3520 punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 3521 lea dstq, [r2+strideq*8] 3522 sub r6, 32 3523 punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 3524 punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 3525 punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 3526 punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 3527 punpckhqdq m5, m7, m1 ; 8 0 3528 vextracti128 [r2 +strideq*0], m5, 1 3529 punpcklqdq m7, m1 ; 9 1 3530 mova [dstq+strideq*0], xm5 3531 punpckhqdq m1, m8, m3 ; 10 2 3532 vextracti128 [r2 +strideq*1], m7, 1 3533 punpcklqdq m8, m3 ; 11 3 3534 mova [dstq+strideq*1], xm7 3535 punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 3536 
vextracti128 [r2 +strideq*2], m1, 1 3537 punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 3538 mova [dstq+strideq*2], xm1 3539 punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 3540 vextracti128 [r2 +r3 ], m8, 1 3541 punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 3542 mova [dstq+r3 ], xm8 3543 punpckhqdq m6, m3, m2 ; 12 4 3544 vextracti128 [r2 +strideq*4], m6, 1 3545 punpcklqdq m3, m2 ; 13 5 3546 mova [dstq+strideq*4], xm6 3547 punpckhqdq m2, m0, m4 ; 14 6 3548 vextracti128 [r2 +r4 ], m3, 1 3549 punpcklqdq m0, m4 ; 15 7 3550 mova [dstq+r4 ], xm3 3551 vextracti128 [r2 +r3*2 ], m2, 1 3552 mova [dstq+r3*2 ], xm2 3553 vextracti128 [r2 +r5 ], m0, 1 3554 mova [dstq+r5 ], xm0 3555 lea r2, [dstq+strideq*8] 3556 cmp r6, rsp 3557 jae .h32_transpose_loop 3558 add rsp, 64*8 3559 sub org_wd, 8 3560 jg .h32_transpose_loop0 3561.h32_end: 3562 RET 3563.h64: 3564 %assign stack_offset org_stack_offset 3565 ALLOC_STACK -256, 10 3566 lea maxbased, [wq+63] 3567 test angled, 0x400 3568 jnz .h64_main 3569 vpbroadcastd m2, [pw_3] 3570 movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i 3571 punpckhwd m1, m0, m0 3572 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i 3573 paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3574 paddw m1, m2 3575 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3576 pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3577 lea r4, [rsp+224] 3578 paddw m0, m1 3579 lea r5d, [wq+32] 3580 psrlw m0, 2 3581 mova [r4], m0 3582.h64_filter_loop: 3583 mova m0, [tlq-62] 3584 paddw m1, m2, [tlq-66] 3585 paddw m0, [tlq-64] 3586 pavgw m1, [tlq-58] 3587 paddw m0, [tlq-60] 3588 sub tlq, 32 3589 sub r4, 32 3590 paddw m0, m1 3591 psrlw m0, 2 3592 mova [r4], m0 3593 sub r5d, 16 3594 jg .h64_filter_loop 3595 mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f 3596 punpcklwd xm1, xm0, xm0 3597 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h 3598 paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g 3599 vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d 
; --- ipred_z3 16bpc, w=64: filter tail + main loop + transpose setup --------
; NOTE(review): bare decimal numbers are source-listing line numbers fused in
; by extraction; preserved unchanged.
; Finishes the 64-wide edge smoothing, then per iteration computes one full
; 64-pixel column (four 32-byte segments) into a stack scratch area that
; grows downward (sub rsp, 128 per column); columns past max_base_y are
; filled with the clamp pixel (m6).  m7/m8/m9 hold staggered -1024-based
; thresholds (pw_m1024) so each 16-lane segment gets its own range test.
; The transpose stage then re-reads the scratch area and writes rows.
3600 vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
3601 lea tlq, [rsp+254]           ; edge consumed from the stack copy below
3602 pavgw m2, m3
3603 paddw m0, m1
3604 paddw m0, m2
3605 psrlw m0, 2
3606 mova [r4-32], m0
3607
.h64_main:                        ; ypos setup (6.6 fixed point)
3608 neg maxbaseq
3609 movd xm4, dyd
3610 vpbroadcastw m6, [tlq+maxbaseq*2] ; replicated clamp pixel
3611 shl maxbased, 6
3612 vpbroadcastw m4, xm4
3613 lea r4d, [maxbaseq+dyq+15*64]
3614 neg dyq
3615 vpbroadcastd m7, [pw_m1024]
3616 movd xm3, r4d
3617 lea r4, [dyq+63]
3618 paddw m8, m7, m7
3619 vpbroadcastw m3, xm3
3620 or maxbased, 63
3621 paddw m9, m8, m7
3622 psubw m3, [z_base_inc]
3623
.h64_loop:                        ; one 64-pixel column per iteration
3624 mov r5, r4
3625 sar r5, 6                    ; base = ypos >> 6
3626 movu m1, [tlq+r5*2-128]
3627 movu m0, [tlq+r5*2-126]
3628 pand m2, m5, m3              ; frac = ypos & 0x3e (shared by all 4 segments)
3629 psllw m2, 9
3630 psubw m1, m0
3631 pmulhrsw m1, m2
3632 sub rsp, 128                 ; allocate scratch for this column
3633 paddw m0, m1
3634 pcmpgtw m1, m9, m3           ; per-segment range test vs staggered threshold
3635 vpblendvb m0, m6, m0, m1
3636 mova [rsp+32*0], m0
3637 movu m1, [tlq+r5*2-96]
3638 movu m0, [tlq+r5*2-94]
3639 psubw m1, m0
3640 pmulhrsw m1, m2
3641 paddw m0, m1
3642 pcmpgtw m1, m8, m3
3643 vpblendvb m0, m6, m0, m1
3644 mova [rsp+32*1], m0
3645 movu m1, [tlq+r5*2-64]
3646 movu m0, [tlq+r5*2-62]
3647 psubw m1, m0
3648 pmulhrsw m1, m2
3649 paddw m0, m1
3650 pcmpgtw m1, m7, m3
3651 vpblendvb m0, m6, m0, m1
3652 mova [rsp+32*2], m0
3653 movu m1, [tlq+r5*2-32]
3654 movu m0, [tlq+r5*2-30]
3655 psubw m1, m0
3656 pmulhrsw m1, m2
3657 add r4, dyq                  ; advance ypos for the next column
3658 psraw m2, m3, 15
3659 paddw m3, m4
3660 paddw m0, m1
3661 vpblendvb m0, m6, m0, m2
3662 mova [rsp+32*3], m0
3663 dec wd
3664 jz .h64_transpose
3665 cmp r4d, maxbased
3666 jg .h64_loop
3667
.h64_end_loop:                    ; remaining columns are pure clamp pixel
3668 sub rsp, 128
3669 mova [rsp+32*0], m6
3670 mova [rsp+32*1], m6
3671 mova [rsp+32*2], m6
3672 mova [rsp+32*3], m6
3673 dec wd
3674 jg .h64_end_loop
3675
.h64_transpose:                   ; stride helpers for 8-row output batches
3676 lea r2, [strideq*3]
3677 lea r3, [strideq*5]
3678 mov r5, dstq
3679 lea r4, [strideq+r2*2]
3680
.h64_transpose_loop0:             ; outer loop over 16-column groups
3681 lea r6, [rsp+112]
3682 lea dstq, [r5+org_wq*2-32]
3683
.h64_transpose_loop:              ; gather 16 columns, 16x16 word transpose
3684 mova xm0, [r6+128*15]
3685 vinserti128 m0, [r6+128* 7], 1
; -----------------------------------------------------------------------------
; Tail of the h64 transpose loop of a routine whose entry point precedes this
; chunk (NOTE(review): the cglobal header is outside this view — presumably the
; z3 h64 path; confirm against the full file). Loads 8 rows of intermediate
; results from the stack buffer at r6, performs an 8x8-ish word transpose via
; the punpck{l,h}wd/dq/qdq ladder, and stores 8 output rows. Loops until r6
; walks back down to rsp, then releases the 128*16-byte stack buffer.
    mova               xm1, [r6+128*14]
    vinserti128         m1, [r6+128* 6], 1
    mova               xm2, [r6+128*13]
    vinserti128         m2, [r6+128* 5], 1
    mova               xm3, [r6+128*12]
    vinserti128         m3, [r6+128* 4], 1
    mova               xm4, [r6+128*11]
    vinserti128         m4, [r6+128* 3], 1
    mova               xm5, [r6+128*10]
    vinserti128         m5, [r6+128* 2], 1
    mova               xm6, [r6+128* 9]
    vinserti128         m6, [r6+128* 1], 1
    mova               xm7, [r6+128* 8]
    vinserti128         m7, [r6+128* 0], 1
    ; 16-bit transpose: interleave words, then dwords, then qwords
    punpckhwd           m8, m0, m1
    punpcklwd           m0, m1
    punpckhwd           m1, m2, m3
    punpcklwd           m2, m3
    punpckhwd           m3, m4, m5
    punpcklwd           m4, m5
    punpckhwd           m5, m6, m7
    punpcklwd           m6, m7
    sub                 r6, 16                  ; step source pointer back one column
    punpckhdq           m7, m8, m1
    punpckldq           m8, m1
    punpckhdq           m1, m3, m5
    punpckldq           m3, m5
    punpckhqdq          m5, m7, m1
    punpcklqdq          m7, m1
    punpckhqdq          m1, m8, m3
    punpcklqdq          m8, m3
    punpckhdq           m3, m0, m2
    mova  [dstq+strideq*0], m5
    punpckldq           m0, m2
    mova  [dstq+strideq*1], m7
    punpckhdq           m2, m4, m6
    mova  [dstq+strideq*2], m1
    punpckldq           m4, m6
    mova  [dstq+r2       ], m8                  ; r2/r3/r4: precomputed stride multiples (set before this view)
    punpckhqdq          m6, m3, m2
    mova  [dstq+strideq*4], m6
    punpcklqdq          m3, m2
    mova  [dstq+r3       ], m3
    punpckhqdq          m2, m0, m4
    mova  [dstq+r2*2     ], m2
    punpcklqdq          m0, m4
    mova  [dstq+r4       ], m0
    lea               dstq, [dstq+strideq*8]
    cmp                 r6, rsp                 ; done when the stack buffer is consumed
    jae .h64_transpose_loop
    add                rsp, 128*16              ; free the transpose buffer
    sub             org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET

; Apply the 4x2 filter-intra kernel to one 4x2 block held in xm<src>.
; The 7-tap coefficients are preloaded in m2..m5 (one pshufd lane each),
; m1 holds the rounding constant; result is >>4, packed to words and
; clamped to bdmax.
%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
%ifnum %4
    pshufb            xm%2, xm%4
%else
    pshufb            xm%2, %4
%endif
    vinserti128        m%2, xm%2, 1             ; duplicate input into both lanes
    pshufd             m%1, m%2, q0000
    pmaddwd            m%1, m2
    pshufd             m%3, m%2, q1111
    pmaddwd            m%3, m3
    paddd              m%1, m1                  ; + rounding (pd_8)
    paddd              m%1, m%3
    pshufd             m%3, m%2, q2222
    pmaddwd            m%3, m4
    paddd              m%1, m%3
    pshufd             m%3, m%2, q3333
    pmaddwd            m%3, m5
    paddd              m%1, m%3
    psrad              m%1, 4
    packusdw           m%1, m%1
    pminsw             m%1, m%5                 ; clamp to bitdepth_max
%endmacro

; Same kernel as FILTER_1BLK but filters two independent 4x2 blocks at once:
; the low half of m<src> (after the shuffle) and its high qwords (via vpermq).
%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
    pshufb             m%2, m%6
    vpermq             m%4, m%2, q3232          ; second block's source
    vinserti128        m%2, xm%2, 1
    pshufd             m%1, m%2, q0000
    pshufd             m%3, m%4, q0000
    pmaddwd            m%1, m2
    pmaddwd            m%3, m2
    paddd              m%1, m1
    paddd              m%3, m1
    pshufd             m%5, m%2, q1111
    pmaddwd            m%5, m3
    paddd              m%1, m%5
    pshufd             m%5, m%4, q1111
    pmaddwd            m%5, m3
    paddd              m%3, m%5
    pshufd             m%5, m%2, q2222
    pmaddwd            m%5, m4
    paddd              m%1, m%5
    pshufd             m%5, m%4, q2222
    pmaddwd            m%5, m4
    paddd              m%3, m%5
    pshufd             m%5, m%2, q3333
    pmaddwd            m%5, m5
    paddd              m%1, m%5
    pshufd             m%5, m%4, q3333
    pmaddwd            m%5, m5
    paddd              m%3, m%5
    psrad              m%1, 4
    psrad              m%3, 4
    packusdw           m%1, m%3                 ; both blocks packed into one register
    pminsw             m%1, m%7
%endmacro

; The ipred_filter SIMD processes 4x2 blocks in the following order which
; increases parallelism compared to doing things row by row. One redundant
; block is calculated for w8 and w16, two for w32.
; w4     w8       w16             w32
; 1     1 2     1 2 3 5     1 2 3 5 b c d f
; 2     2 3     2 4 5 7     2 4 5 7 c e f h
; 3     3 4     4 6 7 9     4 6 7 9 e g h j
; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
; 5          8            i

; FILTER intra prediction (16 bpc, AVX2). Loads the 4 rows of 7-tap
; coefficients selected by 'filter' from filter_intra_taps into m2..m5 and
; dispatches on block width through a jump table. r8m = bitdepth_max.
cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
%assign org_stack_offset stack_offset
%define base r6-ipred_filter_16bpc_avx2_table
    lea                 r6, [filter_intra_taps]
    tzcnt               wd, wm
%ifidn filterd, filterm
    movzx          filterd, filterb
%else
    movzx          filterd, byte filterm
%endif
    shl            filterd, 6                   ; 64 bytes of taps per filter index
    add            filterq, r6
    lea                 r6, [ipred_filter_16bpc_avx2_table]
    vbroadcasti128      m0, [tlq-6]
    movsxd              wq, [r6+wq*4]
    vpbroadcastd        m1, [base+pd_8]         ; rounding for the >>4 in FILTER_*BLK
    pmovsxbw            m2, [filterq+16*0]
    pmovsxbw            m3, [filterq+16*1]
    pmovsxbw            m4, [filterq+16*2]
    pmovsxbw            m5, [filterq+16*3]
    add                 wq, r6
    mov                 hd, hm
    jmp                 wq
.w4:
    WIN64_SPILL_XMM 10
    mova               xm8, [base+filter_shuf2]
    vpbroadcastw        m9, r8m ; bitdepth_max
    lea                 r7, [6+hq*2]
    sub                tlq, r7                  ; rebase tlq so [tlq+hq*2] walks the left edge
    jmp .w4_loop_start
.w4_loop:
    pinsrq             xm0, [tlq+hq*2], 0
    lea               dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER_1BLK 6, 0, 7, 8, 9
    vextracti128       xm0, m6, 1
    movq  [dstq+strideq*0], xm6
    movq  [dstq+strideq*1], xm0
    sub                 hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    %assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM 16
    vbroadcasti128     m14, [base+filter_shuf3]
    vpbroadcastw       m15, r8m ; bitdepth_max
    FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15
    vpermq              m6, m10, q1302          ; ____ ____ | ____ 4321
    pslldq              m8, m0, 4
    psrldq              m7, m6, 2
    psrldq              m0, m6, 10
    punpcklwd           m7, m0
    vpblendd            m8, m6, 0x33            ; _0__ 4321 | ____ 4321
    vpblendd            m8, m7, 0x40            ; _056 4321 | ____ 4321
    vpblendd            m8, [tlq-6], 0x30       ; _056 4321 | ____ 4321
    lea                 r7, [16+hq*2]
    sub                tlq, r7
    jmp .w8_loop_start
.w8_loop:
    ; rebuild the input block from the previous iteration's output (m9)
    vpermq              m8, m9, q1302           ; ____ 4321 | ____ 4321
    vpermq              m6, m9, q2031
    psrldq              m0, m6, 2
    psrldq              m6, 10
    punpcklwd           m6, m0
    vpblendd            m8, m7, 0x80            ; _0__ 4321 | ____ 4321
    vpblendd            m8, m6, 0x40            ; _056 4321 | ____ 4321
    mova               m10, m9
.w8_loop_start:
    vpblendd            m8, [tlq+hq*2], 0x0C    ; _056 4321 | _056 4321
    call .main
    vpblendd           m10, m9, 0xCC
    mova  [dstq+strideq*0], xm10
    vextracti128 [dstq+strideq*1], m10, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    %assign stack_offset stack_offset - stack_size_padded
    ALLOC_STACK 32, 16
    vpbroadcastw       m15, r8m ; bitdepth_max
    sub                 hd, 2
    TAIL_CALL .w16_main, 0
.w16_main:
    ; also used as a subroutine by .w32 (left 16-pixel half)
    mova              xm10, [base+filter_shuf2]
    FILTER_1BLK 13, 0, 6, 10, 15
    vpermq             m12, m13, q3120
    mova              xm14, [base+filter_shuf3]
    vinserti128        m14, [base+filter_shuf1], 1
    vpbroadcastq        m0, [tlq+10]
    vpblendd            m0, [tlq-16], 0x4C      ; ___0 4321 | _056 ____
    psrldq              m6, m12, 8
    vpblendd            m0, m6, 0x03            ; ___0 4321 | _056 4321
    punpcklwd           m6, m12
    vpblendd            m0, m6, 0x80            ; 56_0 4321 | _056 4321
    FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
    vpblendd           m13, m12, 0xCC
    vpermq             m12, m12, q2031          ; 6___ 5___
    psrldq             xm6, xm12, 2
    psrldq             xm8, xm12, 12
    vpblendd           xm6, xm8, 0x01
    pblendw            xm6, [tlq+10], 0xF8      ; 4321 056_
    FILTER_1BLK 11, 6, 8, 10, 15
    vpermq             m11, m11, q3120
    pshufd              m9, m11, q1032
    movu                m8, [tlq+6]             ; __43 210_ | ____ ____
    pshufd              m8, m8, q3021           ; __0_ 4321 | ____ ____
    pshufhw             m8, m8, q3201           ; ___0 4321 | ____ ____
    vpblendd            m9, m8, 0x70            ; ___0 4321 | ____ 4321
    mova  [dstq+strideq*0], xm13
    vextracti128 [dstq+strideq*1], m13, 1
    lea                 r7, [20+hq*2]
    sub                tlq, r7
    vpermq              m6, m12, q0123          ; ____ 4321 | ____ 4321
    jmp .w16_loop_start
.w16_loop:
    vpermq             m13, m13, q3322
    vpermq             m11, m9, q2020
    vpermq              m9, m9, q1302
    vpermq              m6, m12, q0123
    psrldq              m7, 4
    vpblendd           m13, m10, 0xCC
    vpblendd            m9, m7, 0x40
    mova                m0, [rsp+8]
    mova  [dstq+strideq*0], xm13
    vextracti128 [dstq+strideq*1], m13, 1
.w16_loop_start:
    mova               m13, m12
    vpblendd            m0, [tlq+hq*2], 0x0C
    psrldq              m7, m12, 8
    punpcklwd           m7, m12
    vpblendd            m0, m6, 0x33            ; ___0 4321 | _056 4321
    vpblendd            m0, m7, 0x80            ; 56_0 4321 | _056 4321
    FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
    vpermq             m12, m10, q2031
    mova           [rsp+8], m0                  ; spill: reloaded at .w16_loop
    psrldq              m8, m11, 8
    psrldq             xm6, xm12, 2
    psrldq             xm7, xm12, 10
    psrldq             xm0, xm13, 2
    punpcklwd           m8, m11
    punpcklwd          xm7, xm6
    vpblendd            m8, m9, 0x73            ; 56_0 4321 | ____ 4321
    vpblendd            m8, m7, 0x04            ; 56_0 4321 | __56 4321
    vpblendd            m8, m0, 0x08            ; 56_0 4321 | _056 4321
    call .main
    vpermq              m8, m11, q3120
    vpblendd            m6, m8, m9, 0xCC
    mova [dstq+strideq*0+16], xm6
    vextracti128 [dstq+strideq*1+16], m6, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w16_loop
    ; final two rows
    vpermq              m8, m9, q3120
    vextracti128       xm0, m8, 1               ; 4321 ____
    pshufd            xm11, xm11, q1032
    vpblendd           xm0, xm11, 0x02          ; 4321 0___
    psrldq             xm6, xm8, 2
    psrldq             xm7, xm8, 12
    pblendw            xm0, xm6, 0x4            ; 4321 05__
    pblendw            xm0, xm7, 0x2            ; 4321 056_
    FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
    vpermq             m12, m13, q1302
    vpblendd           m12, m10, 0xCC
    vpblendd            m9, m6, 0xCC
    mova [dstq+strideq*0+ 0], xm12
    mova [dstq+strideq*0+16], xm9
    vextracti128 [dstq+strideq*1+ 0], m12, 1
    vextracti128 [dstq+strideq*1+16], m9, 1
    ret                                         ; plain ret: callable from .w32
ALIGN function_align
.w32:
    %assign stack_offset org_stack_offset
    ALLOC_STACK 64, 16
    vpbroadcastw       m15, r8m ; bitdepth_max
    sub                 hd, 2
    lea                 r3, [dstq+32]           ; right half destination
    lea                r5d, [hd*2+20]
    call .w16_main                              ; predict the left 16 columns first
    mov               dstq, r3
    lea                tlq, [tlq+r5+32]
    sub                r5d, 20
    shr                r5d, 1
    sub                r5d, 2
    lea                 r4, [dstq+strideq*2-2]
DEFINE_ARGS dst, stride, tl, stride3, left, h
    lea           stride3q, [strideq*3]
    ; the right half uses the just-written left-half output as its "left" edge
    movu                m8, [tlq-6]             ; 4321 0___
    mova              xm10, [base+filter_shuf2]
    pinsrw             xm0, xm8, [dstq+strideq*0-2], 2
    pinsrw             xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_
    pinsrw             xm9, [leftq+strideq*0], 5
    pinsrw             xm9, [leftq+strideq*1], 4
    FILTER_1BLK 13, 0, 6, 10, 15
    vpermq             m12, m13, q3120
    mova              xm14, [base+filter_shuf3]
    vinserti128        m14, [base+filter_shuf1], 1
    psrldq              m6, m12, 8
    punpcklwd           m7, m6, m12
    vpblendd            m0, m6, 0x03            ; ___0 ____ | _0__ 4321
    vpblendd            m0, m7, 0x80            ; 56_0 ____ | _0__ 4321
    vpblendd            m0, m8, 0x30            ; 56_0 4321 | _0__ 4321
    vpblendd            m0, m9, 0x04            ; 56_0 4321 | _056 4321
    FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
    vpblendd           m13, m12, 0xCC
    ; buffer left-edge pixels (read from dst as we go) on the stack at rsp+32
    pinsrw             xm9, [leftq+strideq*2], 3
    pinsrw             xm9, [leftq+stride3q ], 2
    lea              leftq, [leftq+strideq*4]
    pinsrw             xm9, [leftq+strideq*0], 1
    pinsrw             xm9, [leftq+strideq*1], 0
    movq          [rsp+32], xm9
    mov                r7d, 1                   ; r7d counts buffered left-edge groups
    pslldq              m8, m9, 4
    vpblendd            m0, m8, 0x0C            ; ___0 ____ | _056 ____
    vpermq             m12, m12, q2031          ; 6___ 5___
    psrldq             xm6, xm12, 2
    psrldq             xm7, xm12, 12
    vpblendd           xm6, xm7, 0x01           ; ____ _56_
    pblendw            xm6, [tlq+10], 0xF8      ; 4321 056_
    FILTER_1BLK 11, 6, 7, 10, 15
    vpermq             m11, m11, q3120
    pshufd              m9, m11, q1032
    vbroadcasti128      m8, [tlq+22]            ; __43 210_ | ____ ____
    pshufd              m8, m8, q3021           ; __0_ 4321 | ____ ____
    pshufhw             m8, m8, q3201           ; ___0 4321 | ____ ____
    vpblendd            m9, m8, 0x70            ; ___0 4321 | ____ 4321
    mova  [dstq+strideq*0], xm13
    vextracti128 [dstq+strideq*1], m13, 1
    vpermq              m6, m12, q0123          ; ____ 4321 | ____ 4321
    jmp .w32_loop_start
.w32_loop_last:
    mova                m0, [rsp+0]
    jmp .w32_loop
.w32_loop_left:
    mova                m0, [rsp+0]
    vpblendd            m0, [rsp+32+r7*4-12], 0x0C
    dec                r7d
    jg .w32_loop
    cmp                 hd, 2
    je .w32_loop
    ; refill the left-edge buffer with the next 8 rows
    pinsrw             xm6, [rsp+32], 6
    pinsrw             xm6, [leftq+strideq*2], 5
    pinsrw             xm6, [leftq+stride3q ], 4
    lea              leftq, [leftq+strideq*4]
    pinsrw             xm6, [leftq+strideq*0], 3
    pinsrw             xm6, [leftq+strideq*1], 2
    pinsrw             xm6, [leftq+strideq*2], 1
    pinsrw             xm6, [leftq+stride3q ], 0
    lea              leftq, [leftq+strideq*4]
    movu          [rsp+36], xm6
    pinsrw             xm6, [leftq+strideq*0], 1
    pinsrw             xm6, [leftq+strideq*1], 0
    movd          [rsp+32], xm6
    mov                r7d, 4
.w32_loop:
    vpermq             m13, m13, q3322
    vpermq             m11, m9, q2020
    vpermq              m9, m9, q1302
    vpermq              m6, m12, q0123
    psrldq              m7, 4
    vpblendd           m13, m10, 0xCC
    vpblendd            m9, m7, 0x40            ; ___0 4321 | ____ 4321
    mova  [dstq+strideq*0], xm13
    vextracti128 [dstq+strideq*1], m13, 1
.w32_loop_start:
    mova               m13, m12
    psrldq              m7, m12, 8
    punpcklwd           m7, m12
    vpblendd            m0, m6, 0x33            ; ___0 4321 | _056 4321
    vpblendd            m0, m7, 0x80            ; 56_0 4321 | _056 4321
    FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
    vpermq             m12, m10, q2031
    mova           [rsp+0], m0                  ; spill: reloaded at loop top
    psrldq              m8, m11, 8
    psrldq             xm6, xm12, 2
    psrldq             xm7, xm12, 10
    psrldq             xm0, xm13, 2
    punpcklwd           m8, m11
    punpcklwd          xm7, xm6
    vpblendd            m8, m9, 0x73            ; 56_0 4321 | ____ 4321
    vpblendd            m8, m7, 0x04            ; 56_0 4321 | __56 4321
    vpblendd            m8, m0, 0x08            ; 56_0 4321 | _056 4321
    call .main
    vpermq              m8, m11, q3120
    vpblendd            m6, m8, m9, 0xCC
    mova [dstq+strideq*0+16], xm6
    vextracti128 [dstq+strideq*1+16], m6, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w32_loop_left
    jz .w32_loop_last
    ; final two rows
    vpermq              m8, m9, q3120
    vextracti128       xm0, m8, 1               ; 4321 ____
    pshufd            xm11, xm11, q1032
    vpblendd           xm0, xm11, 0x02          ; 4321 0___
    psrldq             xm6, xm8, 2
    psrldq             xm7, xm8, 12
    pblendw            xm0, xm6, 0x4            ; 4321 05__
    pblendw            xm0, xm7, 0x2            ; 4321 056_
    FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
    vpermq             m12, m13, q1302
    vpblendd           m12, m10, 0xCC
    vpblendd            m9, m6, 0xCC
    mova [dstq+strideq*0+ 0], xm12
    mova [dstq+strideq*0+16], xm9
    vextracti128 [dstq+strideq*1+ 0], m12, 1
    vextracti128 [dstq+strideq*1+16], m9, 1
    RET
.main:
    ; shared double-block filter step: m8 in, m9 out
    FILTER_2BLK 9, 8, 6, 7, 0, 14, 15
    ret

%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; Scale one 32-byte AC register by alpha and add the DC value:
; m0 = DC, m1 = alpha (sign), m2 = |alpha|<<9 (for pmulhrsw).
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw              m3, m%1, m1
    pabsw              m%1, m%1
    pmulhrsw           m%1, m2
    psignw             m%1, m3
    paddw              m%1, m0
%endmacro

; CfL prediction with DC taken from the top edge only. Computes the DC via
; the shared ipred_cfl_left .h* code, then jumps to the ipred_cfl .s* splat
; loops. r7m = bitdepth_max.
cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn           hd, hm
    add                tlq, 2                   ; skip the top-left pixel
    movd               xm4, wd
    pxor                m6, m6
    vpbroadcastw        m7, r7m
    pavgw              xm4, xm6                 ; rounding bias = w/2
    tzcnt               wd, wd
    movd               xm5, wd                  ; shift = log2(w)
    movu                m0, [tlq]
    lea                 t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd              r6, [t0+wq*4]
    add                 r6, t0
    add                 t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    movsxd              wq, [t0+wq*4]
    add                 wq, t0
    movifnidn          acq, acmp
    jmp                 r6

; CfL prediction with DC taken from the left edge only. The .h* labels sum
; h pixels (reading the left edge stored before tlq), average, and fall
; through to the width-dispatched splat code (wq).
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    mov                 hd, hm ; zero upper half
    sub                tlq, hq
    movd               xm4, hd
    sub                tlq, hq                  ; tlq -= 2*h (left pixels, 2 bytes each)
    pxor                m6, m6
    vpbroadcastw        m7, r7m
    pavgw              xm4, xm6                 ; rounding bias = h/2
    tzcnt              r6d, hd
    movd               xm5, r6d                 ; shift = log2(h)
    movu                m0, [tlq]
    lea                 t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd              r6, [t0+r6*4]
    add                 r6, t0
    add                 t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    tzcnt               wd, wd
    movsxd              wq, [t0+wq*4]
    add                 wq, t0
    movifnidn          acq, acmp
    jmp                 r6
.h32:
    paddw               m0, [tlq+32]
.h16:
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
.h8:
    psrldq             xm1, xm0, 8
    paddw              xm0, xm1
.h4:
    ; horizontal reduction of the word sums, then (sum + bias) >> shift
    punpcklwd          xm0, xm6
    psrlq              xm1, xm0, 32
    paddd              xm0, xm1
    psrldq             xm1, xm0, 8
    paddd              xm0, xm1
    paddd              xm0, xm4
    psrld              xm0, xm5
    vpbroadcastw        m0, xm0
    jmp                 wq

; Full CfL DC prediction: DC = average of both the top and left edges.
; The .h* labels sum the left edge, the .w* labels add the top edge,
; normalize (with a multiplier trick for non-power-of-two w+h), and the
; .s* loops write alpha*AC + DC clamped to [0, bitdepth_max].
cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn           hd, hm
    movifnidn           wd, wm
    tzcnt              r6d, hd
    lea                t0d, [wq+hq]
    movd               xm4, t0d                 ; w+h, halved below -> rounding bias
    tzcnt              t0d, t0d
    movd               xm5, t0d                 ; shift = log2(w+h)
    lea                 t0, [ipred_cfl_16bpc_avx2_table]
    tzcnt               wd, wd
    movsxd              r6, [t0+r6*4]
    movsxd              wq, [t0+wq*4+4*4]
    psrlw              xm4, 1
    pxor                m6, m6
    vpbroadcastw        m7, r7m
    add                 r6, t0
    add                 wq, t0
    movifnidn          acq, acmp
    jmp                 r6
.h4:
    movq               xm0, [tlq-8]
    jmp                 wq
.w4:
    movq               xm1, [tlq+2]
    paddw               m0, m4
    paddw               m0, m1
    psrlq               m1, m0, 32
    paddw               m0, m1
    psrld               m1, m0, 16
    paddw               m0, m1
    cmp                 hd, 4
    jg .w4_mul
    psrlw              xm0, 3                   ; w == h == 4: plain >>3
    jmp .w4_end
.w4_mul:
    ; w+h not a power of two: divide via fixed-point reciprocal
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    lea                r2d, [hq*2]
    mov                r6d, 0xAAAB6667
    shrx               r6d, r6d, r2d            ; select 0x6667 (h=8) or 0xAAAB (h=16)
    punpckhwd          xm1, xm0, xm6
    punpcklwd          xm0, xm6
    paddd              xm0, xm1
    movd               xm1, r6d
    psrld              xm0, 2
    pmulhuw            xm0, xm1
    psrlw              xm0, 1
.w4_end:
    vpbroadcastw        m0, xm0
.s4:
    vpbroadcastw        m1, alpham
    lea                 r6, [strideq*3]
    pabsw               m2, m1
    psllw               m2, 9                   ; scale |alpha| for pmulhrsw
.s4_loop:
    mova                m4, [acq]
    IPRED_CFL 4
    pmaxsw              m4, m6                  ; clamp to [0, bitdepth_max]
    pminsw              m4, m7
    vextracti128       xm5, m4, 1
    movq  [dstq+strideq*0], xm4
    movq  [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r6      ], xm5
    lea               dstq, [dstq+strideq*4]
    add                acq, 32
    sub                 hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    mova               xm0, [tlq-16]
    jmp                 wq
.w8:
    vextracti128       xm1, m0, 1
    paddw              xm0, [tlq+2]
    paddw              xm0, xm4
    paddw              xm0, xm1
    psrld              xm1, xm0, 16
    paddw              xm0, xm1
    pblendw            xm0, xm6, 0xAA
    psrlq              xm1, xm0, 32
    paddd              xm0, xm1
    psrldq             xm1, xm0, 8
    paddd              xm0, xm1
    psrld              xm0, xm5
    cmp                 hd, 8
    je .w8_end
    ; non-square: scale by 2/3 (0x6667) or 2/6 (0xAAAB) equivalent factor
    mov                r6d, 0xAAAB
    mov                r2d, 0x6667
    cmp                 hd, 32
    cmovz              r6d, r2d
    movd               xm1, r6d
    pmulhuw            xm0, xm1
    psrlw              xm0, 1
.w8_end:
    vpbroadcastw        m0, xm0
.s8:
    vpbroadcastw        m1, alpham
    lea                 r6, [strideq*3]
    pabsw               m2, m1
    psllw               m2, 9
.s8_loop:
    mova                m4, [acq]
    mova                m5, [acq+32]
    IPRED_CFL 4
    IPRED_CFL 5
    pmaxsw              m4, m6
    pmaxsw              m5, m6
    pminsw              m4, m7
    pminsw              m5, m7
    mova  [dstq+strideq*0], xm4
    mova  [dstq+strideq*2], xm5
    vextracti128 [dstq+strideq*1], m4, 1
    vextracti128 [dstq+r6      ], m5, 1
    lea               dstq, [dstq+strideq*4]
    add                acq, 64
    sub                 hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova                m0, [tlq-32]
    jmp                 wq
.w16:
    paddw               m0, [tlq+2]
    vextracti128       xm1, m0, 1
    paddw              xm0, xm4
    paddw              xm0, xm1
    punpckhwd          xm1, xm0, xm6
    punpcklwd          xm0, xm6
    paddd              xm0, xm1
    psrlq              xm1, xm0, 32
    paddd              xm0, xm1
    psrldq             xm1, xm0, 8
    paddd              xm0, xm1
    psrld              xm0, xm5
    cmp                 hd, 16
    je .w16_end
    mov                r6d, 0xAAAB
    mov                r2d, 0x6667
    test                hb, 8|32
    cmovz              r6d, r2d
    movd               xm1, r6d
    pmulhuw            xm0, xm1
    psrlw              xm0, 1
.w16_end:
    vpbroadcastw        m0, xm0
.s16:
    vpbroadcastw        m1, alpham
    pabsw               m2, m1
    psllw               m2, 9
.s16_loop:
    mova                m4, [acq]
    mova                m5, [acq+32]
    IPRED_CFL 4
    IPRED_CFL 5
    pmaxsw              m4, m6
    pmaxsw              m5, m6
    pminsw              m4, m7
    pminsw              m5, m7
    mova  [dstq+strideq*0], m4
    mova  [dstq+strideq*1], m5
    lea               dstq, [dstq+strideq*2]
    add                acq, 64
    sub                 hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova                m0, [tlq-64]
    paddw               m0, [tlq-32]
    jmp                 wq
.w32:
    paddw               m0, [tlq+ 2]
    paddw               m0, [tlq+34]
    vextracti128       xm1, m0, 1
    paddw              xm0, xm4
    paddw              xm0, xm1
    punpcklwd          xm1, xm0, xm6
    punpckhwd          xm0, xm6
    paddd              xm0, xm1
    psrlq              xm1, xm0, 32
    paddd              xm0, xm1
    psrldq             xm1, xm0, 8
    paddd              xm0, xm1
    psrld              xm0, xm5
    cmp                 hd, 32
    je .w32_end
    lea                r2d, [hq*2]
    mov                r6d, 0x6667AAAB
    shrx               r6d, r6d, r2d            ; select multiplier by h
    movd               xm1, r6d
    pmulhuw            xm0, xm1
    psrlw              xm0, 1
.w32_end:
    vpbroadcastw        m0, xm0
.s32:
    vpbroadcastw        m1, alpham
    pabsw               m2, m1
    psllw               m2, 9
.s32_loop:
    mova                m4, [acq]
    mova                m5, [acq+32]
    IPRED_CFL 4
    IPRED_CFL 5
    pmaxsw              m4, m6
    pmaxsw              m5, m6
    pminsw              m4, m7
    pminsw              m5, m7
    mova       [dstq+32*0], m4
    mova       [dstq+32*1], m5
    add               dstq, strideq
    add                acq, 64
    dec                 hd
    jg .s32_loop
    RET

; CfL prediction with a fixed mid-gray DC (half of the bitdepth range).
; r7m (bitdepth_max) >> 11 indexes a pw_512-based constant table, then jumps
; into the shared ipred_cfl splat code.
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    mov                r6d, r7m
    shr                r6d, 11                  ; 0 for 10-bit, 1 for 12-bit
    lea                 t0, [ipred_cfl_splat_16bpc_avx2_table]
    tzcnt               wd, wd
    movifnidn           hd, hm
    movsxd              wq, [t0+wq*4]
    vpbroadcastd        m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
    pxor                m6, m6
    vpbroadcastw        m7, r7m
    add                 wq, t0
    movifnidn          acq, acmp
    jmp                 wq

; CfL AC coefficient extraction for 4:2:0 chroma: 2x2-average the luma
; (via pmaddwd with pw_2, giving luma*4 sums), replicate the last row/column
; into the padded region, then subtract the mean at .dc (shared by the
; 422/444 variants via mangle()).
cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn        hpadd, hpadm
    vpbroadcastd        m5, [pw_2]
    mov                 hd, hm
    shl              hpadd, 2
    pxor                m4, m4                  ; m4 accumulates the running sum for .dc
    sub                 hd, hpadd
    cmp          dword wm, 8
    jg .w16
    je .w8
.w4:
    lea                 r3, [strideq*3]
    mov                 r5, acq                 ; remember start of ac buffer (for .dc)
    .w4_loop:
    mova               xm0, [ypxq+strideq*2]
    mova               xm1, [ypxq+r3       ]
    vinserti128         m0, [ypxq+strideq*0], 1
    vinserti128         m1, [ypxq+strideq*1], 1
    lea               ypxq, [ypxq+strideq*4]
    pmaddwd             m0, m5                  ; horizontal pair sums * 2
    pmaddwd             m1, m5
    paddd               m0, m1                  ; + vertical neighbor -> 2x2 sum
    vextracti128       xm1, m0, 1
    paddd               m4, m0
    packssdw           xm1, xm0
    mova             [acq], xm1
    add                acq, 16
    sub                 hd, 2
    jg .w4_loop
    test             hpadd, hpadd
    jz .dc
    vpermq              m1, m1, q1111           ; replicate last output row for padding
    pslld              xm0, 2
.w4_hpad_loop:
    mova             [acq], m1
    paddd               m4, m0
    add                acq, 32
    sub              hpadd, 4
    jg .w4_hpad_loop
    jmp .dc
.w8:
    mov                 r5, acq
    test             wpadd, wpadd
    jnz .w8_wpad1
.w8_loop:
    pmaddwd             m0, m5, [ypxq+strideq*0]
    pmaddwd             m1, m5, [ypxq+strideq*1]
    lea               ypxq, [ypxq+strideq*2]
    paddd               m0, m1
    vextracti128       xm1, m0, 1
    paddd               m4, m0
    packssdw           xm1, xm0, xm1
    mova             [acq], xm1
    add                acq, 16
    dec                 hd
    jg .w8_loop
.w8_hpad:
    test             hpadd, hpadd
    jz .dc
    vinserti128         m1, xm1, 1
    pslld               m0, 2
    jmp .hpad
.w8_wpad1:
    ; right-padded w8: replicate the last valid column (pshufd q3333)
    pmaddwd            xm0, xm5, [ypxq+strideq*0]
    pmaddwd            xm3, xm5, [ypxq+strideq*1]
    lea               ypxq, [ypxq+strideq*2]
    paddd              xm0, xm3
    pshufd             xm3, xm0, q3333
    packssdw           xm1, xm0, xm3
    paddd              xm0, xm3
    paddd              xm4, xm0
    mova             [acq], xm1
    add                acq, 16
    dec                 hd
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad:
    ; wpad selects how many of the 16 output columns are valid
    mova                m0, [ypxq+strideq*0+ 0]
    mova                m1, [ypxq+strideq*1+ 0]
    cmp              wpadd, 2
    jl .w16_wpad1
    je .w16_wpad2
    vpbroadcastd        m2, [ypxq+strideq*0+12]
    vpbroadcastd        m3, [ypxq+strideq*1+12]
    vpblendd            m0, m2, 0xf0
    vpblendd            m1, m3, 0xf0
    jmp .w16_wpad_end
.w16_wpad2:
    vpbroadcastd        m2, [ypxq+strideq*0+28]
    vpbroadcastd        m3, [ypxq+strideq*1+28]
    jmp .w16_wpad_end
.w16_wpad1:
    vpbroadcastd        m2, [ypxq+strideq*0+44]
    vpbroadcastd        m3, [ypxq+strideq*1+44]
    vinserti128         m2, [ypxq+strideq*0+32], 0
    vinserti128         m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea               ypxq, [ypxq+strideq*2]
    REPX  {pmaddwd x, m5}, m0, m1, m2, m3
    paddd               m0, m1
    paddd               m2, m3
    packssdw            m1, m0, m2
    paddd               m0, m2
    vpermq              m1, m1, q3120           ; fix lane order after pack
    paddd               m4, m0
    mova             [acq], m1
    add                acq, 32
    dec                 hd
    jg .w16_wpad
    jmp .w16_hpad
.w16:
    mov                 r5, acq
    test             wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmaddwd             m0, m5, [ypxq+strideq*0+ 0]
    pmaddwd             m2, m5, [ypxq+strideq*0+32]
    pmaddwd             m1, m5, [ypxq+strideq*1+ 0]
    pmaddwd             m3, m5, [ypxq+strideq*1+32]
    lea               ypxq, [ypxq+strideq*2]
    paddd               m0, m1
    paddd               m2, m3
    packssdw            m1, m0, m2
    paddd               m0, m2
    vpermq              m1, m1, q3120
    paddd               m4, m0
    mova             [acq], m1
    add                acq, 32
    dec                 hd
    jg .w16_loop
.w16_hpad:
    add              hpadd, hpadd
    jz .dc
    paddd               m0, m0
.hpad:
    ; bottom padding: repeat the last row (m1), keep accumulating its sum
    mova        [acq+32*0], m1
    paddd               m4, m0
    mova        [acq+32*1], m1
    add                acq, 32*2
    sub              hpadd, 4
    jg .hpad
.dc:
    ; subtract the mean from the whole ac buffer (shared tail)
    vextracti128       xm1, m4, 1
    sub                 r5, acq ; -w*h*2
    tzcnt              r1d, r5d
    paddd              xm4, xm1
    sub                r1d, 2                   ; shift = log2(w*h) (sums are scaled x4)
    punpckhqdq         xm1, xm4, xm4
    movd               xm0, r1d
    paddd              xm1, xm4
    pshuflw            xm4, xm1, q1032
    paddd              xm1, xm4
    psrld              xm1, xm0
    pxor               xm0, xm0
    pavgw              xm1, xm0                 ; rounded halving back to pixel scale
    vpbroadcastw        m1, xm1
.dc_loop:
    mova                m0, [acq+r5]
    psubw               m0, m1
    mova          [acq+r5], m0
    add                 r5, 32
    jl .dc_loop
    RET

; CfL AC extraction for 4:2:2 chroma: horizontal-only 2x averaging
; (pmaddwd with pw_4 keeps the same x4 scaling as the 420 path), reusing
; ipred_cfl_ac_420's .dc / .hpad / .w16_hpad tails via mangle().
cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn        hpadd, hpadm
    vpbroadcastd        m5, [pw_4]
    mov                 hd, hm
    shl              hpadd, 2
    pxor                m4, m4
    sub                 hd, hpadd
    cmp          dword wm, 8
    jg .w16
    je .w8
.w4:
    lea                 r3, [strideq*3]
    mov                 r5, acq
.w4_loop:
    mova               xm0, [ypxq+strideq*0]
    mova               xm1, [ypxq+strideq*1]
    vinserti128         m0, [ypxq+strideq*2], 1
    vinserti128         m1, [ypxq+r3       ], 1
    lea               ypxq, [ypxq+strideq*4]
    pmaddwd             m0, m5
    pmaddwd             m1, m5
    paddd               m4, m0
    packssdw            m0, m1
    paddd               m4, m1
    mova             [acq], m0
    add                acq, 32
    sub                 hd, 4
    jg .w4_loop
    test             hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vextracti128       xm1, m1, 1
    vpermq              m0, m0, q3333           ; replicate last row for bottom padding
    pslld              xm1, 2
.w4_hpad_loop:
    mova             [acq], m0
    paddd               m4, m1
    add                acq, 32
    sub              hpadd, 4
    jg .w4_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    mov                 r5, acq
    test             wpadd, wpadd
    jnz .w8_wpad1
.w8_loop:
    pmaddwd             m1, m5, [ypxq+strideq*0]
    pmaddwd             m0, m5, [ypxq+strideq*1]
    lea               ypxq, [ypxq+strideq*2]
    paddd               m4, m1
    packssdw            m1, m0
    paddd               m4, m0
    vpermq              m2, m1, q3120
    mova             [acq], m2
    add                acq, 32
    sub                 hd, 2
    jg .w8_loop
.w8_hpad:
    test             hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vpermq              m1, m1, q3131
    pslld               m0, 2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w8_wpad1:
    ; right-padded w8: broadcast the last valid dword over the padded half
    vpbroadcastd        m1, [ypxq+strideq*0+12]
    vpbroadcastd        m0, [ypxq+strideq*1+12]
    vinserti128         m1, [ypxq+strideq*0+ 0], 0
    vinserti128         m0, [ypxq+strideq*1+ 0], 0
    lea               ypxq, [ypxq+strideq*2]
    pmaddwd             m1, m5
    pmaddwd             m0, m5
    paddd               m4, m1
    packssdw            m1, m0
    paddd               m4, m0
    vpermq              m2, m1, q3120
    mova             [acq], m2
    add                acq, 32
    sub                 hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16:
    mov                 r5, acq
    test             wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmaddwd             m2, m5, [ypxq+strideq*0+ 0]
    pmaddwd             m1, m5, [ypxq+strideq*0+32]
    pmaddwd             m0, m5, [ypxq+strideq*1+ 0]
    pmaddwd             m3, m5, [ypxq+strideq*1+32]
    lea               ypxq, [ypxq+strideq*2]
    paddd               m4, m2
    packssdw            m2, m1
    paddd               m4, m1
    packssdw            m1, m0, m3
    paddd               m0, m3
    vpermq              m2, m2, q3120
    paddd               m4, m0
    vpermq              m1, m1, q3120
    mova        [acq+32*0], m2
    mova        [acq+32*1], m1
    add                acq, 32*2
    sub                 hd, 2
    jg .w16_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
.w16_wpad:
    mova                m2, [ypxq+strideq*0+ 0]
    mova                m0, [ypxq+strideq*1+ 0]
    cmp              wpadd, 2
    jl .w16_wpad1
    je .w16_wpad2
    vpbroadcastd        m1, [ypxq+strideq*0+12]
    vpbroadcastd        m3, [ypxq+strideq*1+12]
    vpblendd            m2, m1, 0xf0
    vpblendd            m0, m3, 0xf0
    jmp .w16_wpad_end
.w16_wpad2:
    vpbroadcastd        m1, [ypxq+strideq*0+28]
    vpbroadcastd        m3, [ypxq+strideq*1+28]
    jmp .w16_wpad_end
.w16_wpad1:
    vpbroadcastd        m1, [ypxq+strideq*0+44]
    vpbroadcastd        m3, [ypxq+strideq*1+44]
    vinserti128         m1, [ypxq+strideq*0+32], 0
    vinserti128         m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea               ypxq, [ypxq+strideq*2]
    REPX  {pmaddwd x, m5}, m2, m0, m1, m3
    paddd               m4, m2
    packssdw            m2, m1
    paddd               m4, m1
    packssdw            m1, m0, m3
    paddd               m0, m3
    vpermq              m2, m2, q3120
    paddd               m4, m0
    vpermq              m1, m1, q3120
    mova        [acq+32*0], m2
    mova        [acq+32*1], m1
    add                acq, 32*2
    sub                 hd, 2
    jg .w16_wpad
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad

; CfL AC extraction for 4:4:4 chroma: no subsampling, pixels are just
; scaled by <<3 (matching the x4 scale of the other variants plus the 2x2
; averaging factor); sums for the mean are gathered with pmaddwd(pw_1).
; Shares ipred_cfl_ac_420's .dc / .hpad tails via mangle().
cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    lea                 r6, [ipred_cfl_ac_444_16bpc_avx2_table]
    tzcnt               wd, wm
    movifnidn        hpadd, hpadm
    vpbroadcastd        m5, [pw_1]
    movsxd              wq, [r6+wq*4]
    shl              hpadd, 2
    add                 wq, r6
    mov                 hd, hm
    pxor                m4, m4
    sub                 hd, hpadd
    jmp                 wq
.w4:
    lea                 r3, [strideq*3]
    mov                 r5, acq
.w4_loop:
    movq               xm0, [ypxq+strideq*0]
    movhps             xm0, [ypxq+strideq*1]
    vpbroadcastq        m1, [ypxq+strideq*2]
    vpbroadcastq        m2, [ypxq+r3       ]
    lea               ypxq, [ypxq+strideq*4]
    vpblendd            m0, m1, 0x30
    vpblendd            m0, m2, 0xc0
    psllw               m0, 3
    pmaddwd             m1, m0, m5
    mova             [acq], m0
    add                acq, 32
    paddd               m4, m1
    sub                 hd, 4
    jg .w4_loop
    test             hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vpermq              m0, m0, q3333           ; replicate last row for bottom padding
    paddd               m1, m1
    mova        [acq+32*0], m0
    vpermq              m1, m1, q3333
    mova        [acq+32*1], m0
    add                acq, 32*2
    paddd               m4, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    lea                 r3, [strideq*3]
    mov                 r5, acq
.w8_loop:
    mova               xm2, [ypxq+strideq*0]
    vinserti128         m2, [ypxq+strideq*1], 1
    mova               xm1, [ypxq+strideq*2]
    vinserti128         m1, [ypxq+r3       ], 1
    lea               ypxq, [ypxq+strideq*4]
    psllw               m2, 3
    psllw               m1, 3
    mova        [acq+32*0], m2
    pmaddwd             m2, m5
    mova        [acq+32*1], m1
    pmaddwd             m0, m1, m5
    add                acq, 32*2
    paddd               m4, m2
    paddd               m4, m0
    sub                 hd, 4
    jg .w8_loop
    test             hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vperm2i128          m1, m1, 0x11
    pslld               m0, 2
    pxor                m2, m2
    vpblendd            m0, m2, 0x0f
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w16_wpad2:
    vpbroadcastw        m3, [ypxq+strideq*0+14]
    vpbroadcastw        m0, [ypxq+strideq*1+14]
    vpblendd            m2, m3, 0xf0
    vpblendd            m1, m0, 0xf0
    jmp .w16_wpad_end
.w16:
    mov                 r5, acq
.w16_loop:
    mova                m2, [ypxq+strideq*0]
    mova                m1, [ypxq+strideq*1]
    test             wpadd, wpadd
    jnz .w16_wpad2
.w16_wpad_end:
    lea               ypxq, [ypxq+strideq*2]
    psllw               m2, 3
    psllw               m1, 3
    mova        [acq+32*0], m2
    pmaddwd             m2, m5
    mova        [acq+32*1], m1
    pmaddwd             m0, m1, m5
    add                acq, 32*2
    paddd               m4, m2
    paddd               m4, m0
    sub                 hd, 2
    jg .w16_loop
    add              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    paddd               m0, m0
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w32:
    mov                 r5, acq
    test             wpadd, wpadd
    jnz .w32_wpad
.w32_loop:
    mova                m0, [ypxq+ 0]
    mova                m1, [ypxq+32]
    add               ypxq, strideq
    psllw               m0, 3
    psllw               m1, 3
    pmaddwd             m2, m0, m5
    mova        [acq+32*0], m0
    pmaddwd             m3, m1, m5
    mova        [acq+32*1], m1
    add                acq, 32*2
    paddd               m2, m3
    paddd               m4, m2
    dec                 hd
    jg .w32_loop
.w32_hpad:
    test             hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    paddd               m2, m2
.w32_hpad_loop:
    mova        [acq+32*0], m0
    mova        [acq+32*1], m1
    paddd               m4, m2
    mova        [acq+32*2], m0
    mova        [acq+32*3], m1
    add                acq, 32*4
    sub              hpadd, 2
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w32_wpad:
    mova                m0, [ypxq+ 0]
    cmp              wpadd, 4
    jl .w32_wpad2
    je .w32_wpad4
    vpbroadcastw        m1, [ypxq+14]           ; replicate last valid pixel rightwards
    vpblendd            m0, m1, 0xf0
    jmp .w32_wpad_end
.w32_wpad4:
    vpbroadcastw        m1, [ypxq+30]
    jmp .w32_wpad_end
.w32_wpad2:
    vpbroadcastw        m1, [ypxq+46]
    vinserti128         m1, [ypxq+32], 0
.w32_wpad_end:
    add               ypxq, strideq
    psllw               m0, 3
    psllw               m1, 3
    pmaddwd             m2, m0, m5
    mova        [acq+32*0], m0
    pmaddwd             m3, m1, m5
    mova        [acq+32*1], m1
    add                acq, 32*2
    paddd               m2, m3
    paddd               m4, m2
    dec                 hd
    jg .w32_wpad
    jmp .w32_hpad

; Palette prediction (16 bpc): expand 4-bit palette indices to 16-bit pixels.
; The 8-entry 16-bit palette is split byte-wise into m3 (low bytes) and m4
; (high bytes) via pal_pred_shuf; two pshufb lookups plus punpck{l,h}bw
; reassemble full 16-bit pixels from each index nibble-pair.
cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vbroadcasti128      m3, [palq]
    lea                 r2, [pal_pred_16bpc_avx2_table]
    tzcnt               wd, wm
    vbroadcasti128      m4, [pal_pred_shuf]
    movifnidn           hd, hm
    movsxd              wq, [r2+wq*4]
    pshufb              m3, m4                  ; m3 = palette low bytes, high bytes
    punpckhqdq          m4, m3, m3              ; m4 = palette high bytes
    add                 wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
    lea           stride3q, [strideq*3]
    jmp                 wq
.w4:
    mova               xm2, [idxq]
    add               idxq, 16
    pshufb             xm1, xm3, xm2            ; look up low bytes
    pshufb             xm2, xm4, xm2            ; look up high bytes
    punpcklbw          xm0, xm1, xm2            ; interleave back to 16-bit pixels
    punpckhbw          xm1, xm2
    movq  [dstq+strideq*0], xm0
    movq  [dstq+strideq*2], xm1
    movhps [dstq+strideq*1], xm0
    movhps [dstq+stride3q ], xm1
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w4
    RET
.w8:
    movu                m2, [idxq] ; only 16-byte alignment
    add               idxq, 32
    pshufb              m1, m3, m2
    pshufb              m2, m4, m2
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova  [dstq+strideq*0], xm0
    mova  [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+stride3q ], m1, 1
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w8
    RET
.w16:
    vpermq              m2, [idxq+ 0], q3120
    vpermq              m5, [idxq+32], q3120
    add               idxq, 64
    pshufb              m1, m3, m2
    pshufb              m2, m4, m2
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m1
    pshufb              m1, m3, m5
    pshufb              m2, m4, m5
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m1
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w16
    RET
.w32:
    vpermq              m2, [idxq+ 0], q3120
    vpermq              m5, [idxq+32], q3120
    add               idxq, 64
    pshufb              m1, m3, m2
    pshufb              m2, m4, m2
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+32], m1
    pshufb              m1, m3, m5
    pshufb              m2, m4, m5
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova [dstq+strideq*1+ 0], m0
    mova [dstq+strideq*1+32], m1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w32
    RET
.w64:
    vpermq              m2, [idxq+ 0], q3120
    vpermq              m5, [idxq+32], q3120
    add               idxq, 64
    pshufb              m1, m3, m2
    pshufb              m2, m4, m2
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova         [dstq+ 0], m0
    mova         [dstq+32], m1
    pshufb              m1, m3, m5
    pshufb              m2, m4, m5
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    mova         [dstq+64], m0
    mova         [dstq+96], m1
    add               dstq, strideq
    dec                 hd
    jg .w64
    RET

%endif