; Copyright © 2018-2020, VideoLAN and dav1d authors
; Copyright © 2018-2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks: db 0, 0, 0, 0
            ; 2
            db 45, 19, 64, 0
            ; 4
            db 39, 25, 50, 14, 59, 5, 64, 0
            ; 8
            db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
            ; 16
            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
            db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
            ; 32
            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
            db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
            db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
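
; The table above stores each mask value m interleaved with its complement
; 64-m so that a single pmaddubsw can blend two inputs per 16-bit lane.
; Rough scalar sketch of one blended pixel (illustrative only; the consumers
; are the obmc/blend kernels further down in this file):
;   dst[x] = (m[x]*a[x] + (64 - m[x])*b[x] + 32) >> 6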

warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
                db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
                db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
                db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4:  db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
bilin_h_shuf8:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
bilin_v_shuf4:  db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4:    db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf:     db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_8x0_8x8:     db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
bdct_lb_dw:     db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
wswap:          db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
rescale_mul:    dd 0, 1, 2, 3, 4, 5, 6, 7
resize_shuf:    db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
                db 7, 7, 7, 7, 7, 7, 7, 7

wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f

pb_64:   times 4 db 64
pw_m256: times 2 dw -256
pw_15:   times 2 dw 15
pw_32:   times 2 dw 32
pw_34:   times 2 dw 34
pw_258:  times 2 dw 258
pw_512:  times 2 dw 512
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32:    dd 32
pd_63:    dd 63
pd_512:   dd 512
pd_32768: dd 32768
pd_0x3ff:  dd 0x3ff
pd_0x4000: dd 0x4000
pq_0x40000000: dq 0x40000000

cextern mc_subpel_filters
cextern mc_warp_filter
cextern resize_filter

%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

%macro BIDIR_JMP_TABLE 1-*
    %xdefine %1_table (%%table - 2*%2)
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

%macro SCALED_JMP_TABLE 1-*
    %xdefine %1_table (%%table - %2)
    %xdefine %%base mangle(private_prefix %+ _%1)
%%table:
    %rep %0 - 1
        dw %%base %+ .w%2 - %%base
        %rotate 1
    %endrep
    %rotate 1
%%dy_1024:
    %xdefine %1_dy1_table (%%dy_1024 - %2)
    %rep %0 - 1
        dw %%base %+ .dy1_w%2 - %%base
        %rotate 1
    %endrep
    %rotate 1
%%dy_2048:
    %xdefine %1_dy2_table (%%dy_2048 - %2)
    %rep %0 - 1
        dw %%base %+ .dy2_w%2 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
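
; Dispatch sketch (illustrative): every *_table built above holds per-width
; offsets of the .w*/.h_w*/.v_w*/.hv_w* code paths relative to a base label,
; so the entry code can branch on the block width with one indexed load, e.g.
; for put (see .put below):
;   tzcnt wd, wm                                  ; wq = log2(width)
;   movzx wd, word [r7+wq*2+table_offset(put,)]   ; offset of .put_w<width>
;   add   wq, r7                                  ; r7 = put_avx2
;   jmp   wq
; table_offset() subtracts the base so the whole bias folds into the load's
; displacement.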

SECTION .text

INIT_XMM avx2
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn     mxyd, r6m ; mx
    lea           r7, [put_avx2]
    tzcnt         wd, wm
    movifnidn     hd, hm
    test          mxyd, mxyd
    jnz           .h
    mov           mxyd, r7m ; my
    test          mxyd, mxyd
    jnz           .v
.put:
    movzx         wd, word [r7+wq*2+table_offset(put,)]
    add           wq, r7
    jmp           wq
.put_w2:
    movzx         r6d, word [srcq+ssq*0]
    movzx         r7d, word [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    mov           [dstq+dsq*0], r6w
    mov           [dstq+dsq*1], r7w
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .put_w2
    RET
.put_w4:
    mov           r6d, [srcq+ssq*0]
    mov           r7d, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    mov           [dstq+dsq*0], r6d
    mov           [dstq+dsq*1], r7d
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .put_w4
    RET
.put_w8:
    mov           r6, [srcq+ssq*0]
    mov           r7, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    mov           [dstq+dsq*0], r6
    mov           [dstq+dsq*1], r7
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .put_w8
    RET
.put_w16:
    movu          m0, [srcq+ssq*0]
    movu          m1, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .put_w16
    RET
INIT_YMM avx2
.put_w32:
    movu          m0, [srcq+ssq*0]
    movu          m1, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .put_w32
    RET
.put_w64:
    movu          m0, [srcq+ssq*0+32*0]
    movu          m1, [srcq+ssq*0+32*1]
    movu          m2, [srcq+ssq*1+32*0]
    movu          m3, [srcq+ssq*1+32*1]
    lea           srcq, [srcq+ssq*2]
    mova          [dstq+dsq*0+32*0], m0
    mova          [dstq+dsq*0+32*1], m1
    mova          [dstq+dsq*1+32*0], m2
    mova          [dstq+dsq*1+32*1], m3
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .put_w64
    RET
.put_w128:
    movu          m0, [srcq+32*0]
    movu          m1, [srcq+32*1]
    movu          m2, [srcq+32*2]
    movu          m3, [srcq+32*3]
    add           srcq, ssq
    mova          [dstq+32*0], m0
    mova          [dstq+32*1], m1
    mova          [dstq+32*2], m2
    mova          [dstq+32*3], m3
    add           dstq, dsq
    dec           hd
    jg            .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
    imul          mxyd, 255
    vbroadcasti128 m4, [bilin_h_shuf8]
    add           mxyd, 16
    movd          xm5, mxyd
    mov           mxyd, r7m ; my
    vpbroadcastw  m5, xm5
    test          mxyd, mxyd
    jnz           .hv
    movzx         wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    vpbroadcastd  m3, [pw_2048]
    add           wq, r7
    jmp           wq
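; A sketch of the coefficient trick above (illustrative): mxy*255 + 16 equals
; (mx << 8) | (16 - mx), i.e. the byte pair {16-mx, mx} broadcast into m5, so
; pmaddubsw on {src[x], src[x+1]} byte pairs yields (16-mx)*src[x] + mx*src[x+1].
; pmulhrsw with pw_2048 then computes (x*2048*2 + 32768) >> 16 = (x + 8) >> 4,
; which is exactly the rounding shift of the formula quoted at .h.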
.h_w2:
    movd          xm0, [srcq+ssq*0]
    pinsrd        xm0, [srcq+ssq*1], 1
    lea           srcq, [srcq+ssq*2]
    pshufb        xm0, xm4
    pmaddubsw     xm0, xm5
    pmulhrsw      xm0, xm3
    packuswb      xm0, xm0
    pextrw        [dstq+dsq*0], xm0, 0
    pextrw        [dstq+dsq*1], xm0, 2
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .h_w2
    RET
.h_w4:
    mova          xm4, [bilin_h_shuf4]
.h_w4_loop:
    movq          xm0, [srcq+ssq*0]
    movhps        xm0, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    pshufb        xm0, xm4
    pmaddubsw     xm0, xm5
    pmulhrsw      xm0, xm3
    packuswb      xm0, xm0
    movd          [dstq+dsq*0], xm0
    pextrd        [dstq+dsq*1], xm0, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .h_w4_loop
    RET
.h_w8:
    movu          xm0, [srcq+ssq*0]
    movu          xm1, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    pshufb        xm0, xm4
    pshufb        xm1, xm4
    pmaddubsw     xm0, xm5
    pmaddubsw     xm1, xm5
    pmulhrsw      xm0, xm3
    pmulhrsw      xm1, xm3
    packuswb      xm0, xm1
    movq          [dstq+dsq*0], xm0
    movhps        [dstq+dsq*1], xm0
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .h_w8
    RET
.h_w16:
    movu          xm0, [srcq+ssq*0+8*0]
    vinserti128   m0, [srcq+ssq*1+8*0], 1
    movu          xm1, [srcq+ssq*0+8*1]
    vinserti128   m1, [srcq+ssq*1+8*1], 1
    lea           srcq, [srcq+ssq*2]
    pshufb        m0, m4
    pshufb        m1, m4
    pmaddubsw     m0, m5
    pmaddubsw     m1, m5
    pmulhrsw      m0, m3
    pmulhrsw      m1, m3
    packuswb      m0, m1
    mova          [dstq+dsq*0], xm0
    vextracti128  [dstq+dsq*1], m0, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .h_w16
    RET
.h_w32:
    movu          m0, [srcq+8*0]
    movu          m1, [srcq+8*1]
    add           srcq, ssq
    pshufb        m0, m4
    pshufb        m1, m4
    pmaddubsw     m0, m5
    pmaddubsw     m1, m5
    pmulhrsw      m0, m3
    pmulhrsw      m1, m3
    packuswb      m0, m1
    mova          [dstq], m0
    add           dstq, dsq
    dec           hd
    jg            .h_w32
    RET
.h_w64:
    movu          m0, [srcq+8*0]
    movu          m1, [srcq+8*1]
    pshufb        m0, m4
    pshufb        m1, m4
    pmaddubsw     m0, m5
    pmaddubsw     m1, m5
    pmulhrsw      m0, m3
    pmulhrsw      m1, m3
    packuswb      m0, m1
    movu          m1, [srcq+8*4]
    movu          m2, [srcq+8*5]
    add           srcq, ssq
    pshufb        m1, m4
    pshufb        m2, m4
    pmaddubsw     m1, m5
    pmaddubsw     m2, m5
    pmulhrsw      m1, m3
    pmulhrsw      m2, m3
    packuswb      m1, m2
    mova          [dstq+32*0], m0
    mova          [dstq+32*1], m1
    add           dstq, dsq
    dec           hd
    jg            .h_w64
    RET
.h_w128:
    mov           r6, -32*3
.h_w128_loop:
    movu          m0, [srcq+r6+32*3+8*0]
    movu          m1, [srcq+r6+32*3+8*1]
    pshufb        m0, m4
    pshufb        m1, m4
    pmaddubsw     m0, m5
    pmaddubsw     m1, m5
    pmulhrsw      m0, m3
    pmulhrsw      m1, m3
    packuswb      m0, m1
    mova          [dstq+r6+32*3], m0
    add           r6, 32
    jle           .h_w128_loop
    add           srcq, ssq
    add           dstq, dsq
    dec           hd
    jg            .h_w128
    RET
.v:
    movzx         wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    imul          mxyd, 255
    vpbroadcastd  m5, [pw_2048]
    add           mxyd, 16
    add           wq, r7
    movd          xm4, mxyd
    vpbroadcastw  m4, xm4
    jmp           wq
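; The vertical paths below use the same arithmetic rotated by 90 degrees:
; m4 holds the {16-my, my} byte pair, and the loops carry the last-loaded row
; across iterations so every source row is only read once. Rough scalar model
; of one output row (illustrative only):
;   dst[y][x] = ((16-my)*src[y][x] + my*src[y+1][x] + 8) >> 4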
.v_w2:
    movd          xm0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw        xm1, xm0, [srcq+ssq*1], 1 ; 0 1
    lea           srcq, [srcq+ssq*2]
    pinsrw        xm0, xm1, [srcq+ssq*0], 0 ; 2 1
    pshuflw       xm1, xm1, q2301           ; 1 0
    punpcklbw     xm1, xm0
    pmaddubsw     xm1, xm4
    pmulhrsw      xm1, xm5
    packuswb      xm1, xm1
    pextrw        [dstq+dsq*0], xm1, 1
    pextrw        [dstq+dsq*1], xm1, 0
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .v_w2_loop
    RET
.v_w4:
    movd          xm0, [srcq+ssq*0]
.v_w4_loop:
    vpbroadcastd  xm2, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    vpblendd      xm1, xm2, xm0, 0x01 ; 0 1
    vpbroadcastd  xm0, [srcq+ssq*0]
    vpblendd      xm2, xm0, 0x02      ; 1 2
    punpcklbw     xm1, xm2
    pmaddubsw     xm1, xm4
    pmulhrsw      xm1, xm5
    packuswb      xm1, xm1
    movd          [dstq+dsq*0], xm1
    pextrd        [dstq+dsq*1], xm1, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .v_w4_loop
    RET
.v_w8:
    movq          xm0, [srcq+ssq*0]
.v_w8_loop:
    movq          xm2, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    punpcklbw     xm1, xm0, xm2
    movq          xm0, [srcq+ssq*0]
    punpcklbw     xm2, xm0
    pmaddubsw     xm1, xm4
    pmaddubsw     xm2, xm4
    pmulhrsw      xm1, xm5
    pmulhrsw      xm2, xm5
    packuswb      xm1, xm2
    movq          [dstq+dsq*0], xm1
    movhps        [dstq+dsq*1], xm1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .v_w8_loop
    RET
.v_w16:
    movu          xm0, [srcq+ssq*0]
.v_w16_loop:
    vbroadcasti128 m3, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    vpblendd      m2, m3, m0, 0x0f ; 0 1
    vbroadcasti128 m0, [srcq+ssq*0]
    vpblendd      m3, m0, 0xf0     ; 1 2
    punpcklbw     m1, m2, m3
    punpckhbw     m2, m3
    pmaddubsw     m1, m4
    pmaddubsw     m2, m4
    pmulhrsw      m1, m5
    pmulhrsw      m2, m5
    packuswb      m1, m2
    mova          [dstq+dsq*0], xm1
    vextracti128  [dstq+dsq*1], m1, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .v_w16_loop
    RET
.v_w32:
%macro PUT_BILIN_V_W32 0
    movu          m0, [srcq+ssq*0]
%%loop:
    movu          m3, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    punpcklbw     m1, m0, m3
    punpckhbw     m2, m0, m3
    movu          m0, [srcq+ssq*0]
    pmaddubsw     m1, m4
    pmaddubsw     m2, m4
    pmulhrsw      m1, m5
    pmulhrsw      m2, m5
    packuswb      m1, m2
    punpcklbw     m2, m3, m0
    punpckhbw     m3, m0
    pmaddubsw     m2, m4
    pmaddubsw     m3, m4
    pmulhrsw      m2, m5
    pmulhrsw      m3, m5
    packuswb      m2, m3
    mova          [dstq+dsq*0], m1
    mova          [dstq+dsq*1], m2
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            %%loop
%endmacro
    PUT_BILIN_V_W32
    RET
.v_w64:
    movu          m0, [srcq+32*0]
    movu          m1, [srcq+32*1]
.v_w64_loop:
    add           srcq, ssq
    movu          m3, [srcq+32*0]
    punpcklbw     m2, m0, m3
    punpckhbw     m0, m3
    pmaddubsw     m2, m4
    pmaddubsw     m0, m4
    pmulhrsw      m2, m5
    pmulhrsw      m0, m5
    packuswb      m2, m0
    mova          m0, m3
    movu          m3, [srcq+32*1]
    mova          [dstq+32*0], m2
    punpcklbw     m2, m1, m3
    punpckhbw     m1, m3
    pmaddubsw     m2, m4
    pmaddubsw     m1, m4
    pmulhrsw      m2, m5
    pmulhrsw      m1, m5
    packuswb      m2, m1
    mova          m1, m3
    mova          [dstq+32*1], m2
    add           dstq, dsq
    dec           hd
    jg            .v_w64_loop
    RET
.v_w128:
    lea           r6d, [hq+(3<<8)]
    mov           r4, srcq
    mov           r7, dstq
.v_w128_loop:
    PUT_BILIN_V_W32
    add           r4, 32
    add           r7, 32
    movzx         hd, r6b
    mov           srcq, r4
    mov           dstq, r7
    sub           r6d, 1<<8
    jg            .v_w128_loop
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    movzx         wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM 8
    shl           mxyd, 11 ; can't shift by 12 due to signed overflow
    vpbroadcastd  m7, [pw_15]
    movd          xm6, mxyd
    add           wq, r7
    paddb         m5, m5
    vpbroadcastw  m6, xm6
    jmp           wq
.hv_w2:
    vpbroadcastd  xm0, [srcq+ssq*0]
    pshufb        xm0, xm4
    pmaddubsw     xm0, xm5
.hv_w2_loop:
    movd          xm1, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    pinsrd        xm1, [srcq+ssq*0], 1
    pshufb        xm1, xm4
    pmaddubsw     xm1, xm5           ; 1 _ 2 _
    shufps        xm2, xm0, xm1, q1032 ; 0 _ 1 _
    mova          xm0, xm1
    psubw         xm1, xm2
    pmulhw        xm1, xm6
    pavgw         xm2, xm7
    paddw         xm1, xm2
    psrlw         xm1, 4
    packuswb      xm1, xm1
    pextrw        [dstq+dsq*0], xm1, 0
    pextrw        [dstq+dsq*1], xm1, 2
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .hv_w2_loop
    RET
.hv_w4:
    mova          xm4, [bilin_h_shuf4]
    movddup       xm0, [srcq+ssq*0]
    pshufb        xm0, xm4
    pmaddubsw     xm0, xm5
.hv_w4_loop:
    movq          xm1, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    movhps        xm1, [srcq+ssq*0]
    pshufb        xm1, xm4
    pmaddubsw     xm1, xm5             ; 1 2
    shufps        xm2, xm0, xm1, q1032 ; 0 1
    mova          xm0, xm1
    psubw         xm1, xm2
    pmulhw        xm1, xm6
    pavgw         xm2, xm7
    paddw         xm1, xm2
    psrlw         xm1, 4
    packuswb      xm1, xm1
    movd          [dstq+dsq*0], xm1
    pextrd        [dstq+dsq*1], xm1, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128 m0, [srcq+ssq*0]
    pshufb        m0, m4
    pmaddubsw     m0, m5
.hv_w8_loop:
    movu          xm1, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    vinserti128   m1, [srcq+ssq*0], 1
    pshufb        m1, m4
    pmaddubsw     m1, m5             ; 1 2
    vperm2i128    m2, m0, m1, 0x21   ; 0 1
    mova          m0, m1
    psubw         m1, m2
    pmulhw        m1, m6
    pavgw         m2, m7
    paddw         m1, m2
    psrlw         m1, 4
    vextracti128  xm2, m1, 1
    packuswb      xm1, xm2
    movq          [dstq+dsq*0], xm1
    movhps        [dstq+dsq*1], xm1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .hv_w8_loop
    RET
.hv_w16:
    movu          m0, [srcq+ssq*0+8*0]
vinserti128 m0, [srcq+ssq*0+8*1], 1 686 pshufb m0, m4 687 pmaddubsw m0, m5 688.hv_w16_loop: 689 movu xm2, [srcq+ssq*1+8*0] 690 vinserti128 m2, [srcq+ssq*1+8*1], 1 691 lea srcq, [srcq+ssq*2] 692 movu xm3, [srcq+ssq*0+8*0] 693 vinserti128 m3, [srcq+ssq*0+8*1], 1 694 pshufb m2, m4 695 pshufb m3, m4 696 pmaddubsw m2, m5 697 psubw m1, m2, m0 698 pmulhw m1, m6 699 pavgw m0, m7 700 paddw m1, m0 701 pmaddubsw m0, m3, m5 702 psubw m3, m0, m2 703 pmulhw m3, m6 704 pavgw m2, m7 705 paddw m3, m2 706 psrlw m1, 4 707 psrlw m3, 4 708 packuswb m1, m3 709 vpermq m1, m1, q3120 710 mova [dstq+dsq*0], xm1 711 vextracti128 [dstq+dsq*1], m1, 1 712 lea dstq, [dstq+dsq*2] 713 sub hd, 2 714 jg .hv_w16_loop 715 RET 716.hv_w128: 717 lea r6d, [hq+(3<<16)] 718 jmp .hv_w32_start 719.hv_w64: 720 lea r6d, [hq+(1<<16)] 721.hv_w32_start: 722 mov r4, srcq 723 mov r7, dstq 724.hv_w32: 725%if WIN64 726 movaps r4m, xmm8 727%endif 728.hv_w32_loop0: 729 movu m0, [srcq+8*0] 730 movu m1, [srcq+8*1] 731 pshufb m0, m4 732 pshufb m1, m4 733 pmaddubsw m0, m5 734 pmaddubsw m1, m5 735.hv_w32_loop: 736 add srcq, ssq 737 movu m2, [srcq+8*0] 738 movu m3, [srcq+8*1] 739 pshufb m2, m4 740 pshufb m3, m4 741 pmaddubsw m2, m5 742 pmaddubsw m3, m5 743 psubw m8, m2, m0 744 pmulhw m8, m6 745 pavgw m0, m7 746 paddw m8, m0 747 mova m0, m2 748 psubw m2, m3, m1 749 pmulhw m2, m6 750 pavgw m1, m7 751 paddw m2, m1 752 mova m1, m3 753 psrlw m8, 4 754 psrlw m2, 4 755 packuswb m8, m2 756 mova [dstq], m8 757 add dstq, dsq 758 dec hd 759 jg .hv_w32_loop 760 add r4, 32 761 add r7, 32 762 movzx hd, r6b 763 mov srcq, r4 764 mov dstq, r7 765 sub r6d, 1<<16 766 jg .hv_w32_loop0 767%if WIN64 768 movaps xmm8, r4m 769%endif 770 RET 771 772cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 773 movifnidn mxyd, r5m ; mx 774 lea r6, [prep%+SUFFIX] 775 tzcnt wd, wm 776 movifnidn hd, hm 777 test mxyd, mxyd 778 jnz .h 779 mov mxyd, r6m ; my 780 test mxyd, mxyd 781 jnz .v 782.prep: 783 movzx wd, word [r6+wq*2+table_offset(prep,)] 784 add wq, r6 785 lea stride3q, [strideq*3] 786 jmp wq 787.prep_w4: 788 movd xm0, [srcq+strideq*0] 789 pinsrd xm0, [srcq+strideq*1], 1 790 pinsrd xm0, [srcq+strideq*2], 2 791 pinsrd xm0, [srcq+stride3q ], 3 792 lea srcq, [srcq+strideq*4] 793 pmovzxbw m0, xm0 794 psllw m0, 4 795 mova [tmpq], m0 796 add tmpq, 32 797 sub hd, 4 798 jg .prep_w4 799 RET 800.prep_w8: 801 movq xm0, [srcq+strideq*0] 802 movhps xm0, [srcq+strideq*1] 803 movq xm1, [srcq+strideq*2] 804 movhps xm1, [srcq+stride3q ] 805 lea srcq, [srcq+strideq*4] 806 pmovzxbw m0, xm0 807 pmovzxbw m1, xm1 808 psllw m0, 4 809 psllw m1, 4 810 mova [tmpq+32*0], m0 811 mova [tmpq+32*1], m1 812 add tmpq, 32*2 813 sub hd, 4 814 jg .prep_w8 815 RET 816.prep_w16: 817 pmovzxbw m0, [srcq+strideq*0] 818 pmovzxbw m1, [srcq+strideq*1] 819 pmovzxbw m2, [srcq+strideq*2] 820 pmovzxbw m3, [srcq+stride3q ] 821 lea srcq, [srcq+strideq*4] 822 psllw m0, 4 823 psllw m1, 4 824 psllw m2, 4 825 psllw m3, 4 826 mova [tmpq+32*0], m0 827 mova [tmpq+32*1], m1 828 mova [tmpq+32*2], m2 829 mova [tmpq+32*3], m3 830 add tmpq, 32*4 831 sub hd, 4 832 jg .prep_w16 833 RET 834.prep_w32: 835 pmovzxbw m0, [srcq+strideq*0+16*0] 836 pmovzxbw m1, [srcq+strideq*0+16*1] 837 pmovzxbw m2, [srcq+strideq*1+16*0] 838 pmovzxbw m3, [srcq+strideq*1+16*1] 839 lea srcq, [srcq+strideq*2] 840 psllw m0, 4 841 psllw m1, 4 842 psllw m2, 4 843 psllw m3, 4 844 mova [tmpq+32*0], m0 845 mova [tmpq+32*1], m1 846 mova [tmpq+32*2], m2 847 mova [tmpq+32*3], m3 848 add tmpq, 32*4 849 sub hd, 2 850 jg .prep_w32 851 RET 852.prep_w64: 853 pmovzxbw 
m0, [srcq+16*0] 854 pmovzxbw m1, [srcq+16*1] 855 pmovzxbw m2, [srcq+16*2] 856 pmovzxbw m3, [srcq+16*3] 857 add srcq, strideq 858 psllw m0, 4 859 psllw m1, 4 860 psllw m2, 4 861 psllw m3, 4 862 mova [tmpq+32*0], m0 863 mova [tmpq+32*1], m1 864 mova [tmpq+32*2], m2 865 mova [tmpq+32*3], m3 866 add tmpq, 32*4 867 dec hd 868 jg .prep_w64 869 RET 870.prep_w128: 871 pmovzxbw m0, [srcq+16*0] 872 pmovzxbw m1, [srcq+16*1] 873 pmovzxbw m2, [srcq+16*2] 874 pmovzxbw m3, [srcq+16*3] 875 psllw m0, 4 876 psllw m1, 4 877 psllw m2, 4 878 psllw m3, 4 879 mova [tmpq+32*0], m0 880 mova [tmpq+32*1], m1 881 mova [tmpq+32*2], m2 882 mova [tmpq+32*3], m3 883 pmovzxbw m0, [srcq+16*4] 884 pmovzxbw m1, [srcq+16*5] 885 pmovzxbw m2, [srcq+16*6] 886 pmovzxbw m3, [srcq+16*7] 887 add tmpq, 32*8 888 add srcq, strideq 889 psllw m0, 4 890 psllw m1, 4 891 psllw m2, 4 892 psllw m3, 4 893 mova [tmpq-32*4], m0 894 mova [tmpq-32*3], m1 895 mova [tmpq-32*2], m2 896 mova [tmpq-32*1], m3 897 dec hd 898 jg .prep_w128 899 RET 900.h: 901 ; 16 * src[x] + (mx * (src[x + 1] - src[x])) 902 ; = (16 - mx) * src[x] + mx * src[x + 1] 903 imul mxyd, 255 904 vbroadcasti128 m4, [bilin_h_shuf8] 905 add mxyd, 16 906 movd xm5, mxyd 907 mov mxyd, r6m ; my 908 vpbroadcastw m5, xm5 909 test mxyd, mxyd 910 jnz .hv 911 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] 912 add wq, r6 913 lea stride3q, [strideq*3] 914 jmp wq 915.h_w4: 916 vbroadcasti128 m4, [bilin_h_shuf4] 917.h_w4_loop: 918 movq xm0, [srcq+strideq*0] 919 movhps xm0, [srcq+strideq*1] 920 movq xm1, [srcq+strideq*2] 921 movhps xm1, [srcq+stride3q ] 922 lea srcq, [srcq+strideq*4] 923 vinserti128 m0, xm1, 1 924 pshufb m0, m4 925 pmaddubsw m0, m5 926 mova [tmpq], m0 927 add tmpq, 32 928 sub hd, 4 929 jg .h_w4_loop 930 RET 931.h_w8: 932.h_w8_loop: 933 movu xm0, [srcq+strideq*0] 934 vinserti128 m0, [srcq+strideq*1], 1 935 movu xm1, [srcq+strideq*2] 936 vinserti128 m1, [srcq+stride3q ], 1 937 lea srcq, [srcq+strideq*4] 938 pshufb m0, m4 939 pshufb m1, m4 940 pmaddubsw m0, m5 941 pmaddubsw m1, m5 942 mova [tmpq+32*0], m0 943 mova [tmpq+32*1], m1 944 add tmpq, 32*2 945 sub hd, 4 946 jg .h_w8_loop 947 RET 948.h_w16: 949.h_w16_loop: 950 movu xm0, [srcq+strideq*0+8*0] 951 vinserti128 m0, [srcq+strideq*0+8*1], 1 952 movu xm1, [srcq+strideq*1+8*0] 953 vinserti128 m1, [srcq+strideq*1+8*1], 1 954 movu xm2, [srcq+strideq*2+8*0] 955 vinserti128 m2, [srcq+strideq*2+8*1], 1 956 movu xm3, [srcq+stride3q +8*0] 957 vinserti128 m3, [srcq+stride3q +8*1], 1 958 lea srcq, [srcq+strideq*4] 959 pshufb m0, m4 960 pshufb m1, m4 961 pshufb m2, m4 962 pshufb m3, m4 963 pmaddubsw m0, m5 964 pmaddubsw m1, m5 965 pmaddubsw m2, m5 966 pmaddubsw m3, m5 967 mova [tmpq+32*0], m0 968 mova [tmpq+32*1], m1 969 mova [tmpq+32*2], m2 970 mova [tmpq+32*3], m3 971 add tmpq, 32*4 972 sub hd, 4 973 jg .h_w16_loop 974 RET 975.h_w32: 976.h_w32_loop: 977 movu xm0, [srcq+strideq*0+8*0] 978 vinserti128 m0, [srcq+strideq*0+8*1], 1 979 movu xm1, [srcq+strideq*0+8*2] 980 vinserti128 m1, [srcq+strideq*0+8*3], 1 981 movu xm2, [srcq+strideq*1+8*0] 982 vinserti128 m2, [srcq+strideq*1+8*1], 1 983 movu xm3, [srcq+strideq*1+8*2] 984 vinserti128 m3, [srcq+strideq*1+8*3], 1 985 lea srcq, [srcq+strideq*2] 986 pshufb m0, m4 987 pshufb m1, m4 988 pshufb m2, m4 989 pshufb m3, m4 990 pmaddubsw m0, m5 991 pmaddubsw m1, m5 992 pmaddubsw m2, m5 993 pmaddubsw m3, m5 994 mova [tmpq+32*0], m0 995 mova [tmpq+32*1], m1 996 mova [tmpq+32*2], m2 997 mova [tmpq+32*3], m3 998 add tmpq, 32*4 999 sub hd, 2 1000 jg .h_w32_loop 1001 RET 1002.h_w64: 1003 movu xm0, 
[srcq+8*0] 1004 vinserti128 m0, [srcq+8*1], 1 1005 movu xm1, [srcq+8*2] 1006 vinserti128 m1, [srcq+8*3], 1 1007 movu xm2, [srcq+8*4] 1008 vinserti128 m2, [srcq+8*5], 1 1009 movu xm3, [srcq+8*6] 1010 vinserti128 m3, [srcq+8*7], 1 1011 add srcq, strideq 1012 pshufb m0, m4 1013 pshufb m1, m4 1014 pshufb m2, m4 1015 pshufb m3, m4 1016 pmaddubsw m0, m5 1017 pmaddubsw m1, m5 1018 pmaddubsw m2, m5 1019 pmaddubsw m3, m5 1020 mova [tmpq+32*0], m0 1021 mova [tmpq+32*1], m1 1022 mova [tmpq+32*2], m2 1023 mova [tmpq+32*3], m3 1024 add tmpq, 32*4 1025 dec hd 1026 jg .h_w64 1027 RET 1028.h_w128: 1029 movu xm0, [srcq+8*0] 1030 vinserti128 m0, [srcq+8*1], 1 1031 movu xm1, [srcq+8*2] 1032 vinserti128 m1, [srcq+8*3], 1 1033 movu xm2, [srcq+8*4] 1034 vinserti128 m2, [srcq+8*5], 1 1035 movu xm3, [srcq+8*6] 1036 vinserti128 m3, [srcq+8*7], 1 1037 pshufb m0, m4 1038 pshufb m1, m4 1039 pshufb m2, m4 1040 pshufb m3, m4 1041 pmaddubsw m0, m5 1042 pmaddubsw m1, m5 1043 pmaddubsw m2, m5 1044 pmaddubsw m3, m5 1045 mova [tmpq+32*0], m0 1046 mova [tmpq+32*1], m1 1047 mova [tmpq+32*2], m2 1048 mova [tmpq+32*3], m3 1049 movu xm0, [srcq+8* 8] 1050 vinserti128 m0, [srcq+8* 9], 1 1051 movu xm1, [srcq+8*10] 1052 vinserti128 m1, [srcq+8*11], 1 1053 movu xm2, [srcq+8*12] 1054 vinserti128 m2, [srcq+8*13], 1 1055 movu xm3, [srcq+8*14] 1056 vinserti128 m3, [srcq+8*15], 1 1057 add tmpq, 32*8 1058 add srcq, strideq 1059 pshufb m0, m4 1060 pshufb m1, m4 1061 pshufb m2, m4 1062 pshufb m3, m4 1063 pmaddubsw m0, m5 1064 pmaddubsw m1, m5 1065 pmaddubsw m2, m5 1066 pmaddubsw m3, m5 1067 mova [tmpq-32*4], m0 1068 mova [tmpq-32*3], m1 1069 mova [tmpq-32*2], m2 1070 mova [tmpq-32*1], m3 1071 dec hd 1072 jg .h_w128 1073 RET 1074.v: 1075 WIN64_SPILL_XMM 7 1076 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] 1077 imul mxyd, 255 1078 add mxyd, 16 1079 add wq, r6 1080 lea stride3q, [strideq*3] 1081 movd xm6, mxyd 1082 vpbroadcastw m6, xm6 1083 jmp wq 1084.v_w4: 1085 movd xm0, [srcq+strideq*0] 1086.v_w4_loop: 1087 vpbroadcastd m1, [srcq+strideq*2] 1088 vpbroadcastd xm2, [srcq+strideq*1] 1089 vpbroadcastd m3, [srcq+stride3q ] 1090 lea srcq, [srcq+strideq*4] 1091 vpblendd m1, m0, 0x05 ; 0 2 2 2 1092 vpbroadcastd m0, [srcq+strideq*0] 1093 vpblendd m3, m2, 0x0f ; 1 1 3 3 1094 vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 1095 vpblendd m1, m3, 0xaa ; 0 1 2 3 1096 vpblendd m2, m3, 0x55 ; 1 2 3 4 1097 punpcklbw m1, m2 1098 pmaddubsw m1, m6 1099 mova [tmpq], m1 1100 add tmpq, 32 1101 sub hd, 4 1102 jg .v_w4_loop 1103 RET 1104.v_w8: 1105 movq xm0, [srcq+strideq*0] 1106.v_w8_loop: 1107 vpbroadcastq m1, [srcq+strideq*2] 1108 vpbroadcastq m2, [srcq+strideq*1] 1109 vpbroadcastq m3, [srcq+stride3q ] 1110 lea srcq, [srcq+strideq*4] 1111 vpblendd m1, m0, 0x03 ; 0 2 2 2 1112 vpbroadcastq m0, [srcq+strideq*0] 1113 vpblendd m2, m3, 0xcc ; 1 3 1 3 1114 vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 1115 vpblendd m2, m1, 0x0f ; 0 2 1 3 1116 vpblendd m3, m0, 0xc0 ; 1 3 2 4 1117 punpcklbw m1, m2, m3 1118 punpckhbw m2, m3 1119 pmaddubsw m1, m6 1120 pmaddubsw m2, m6 1121 mova [tmpq+32*0], m1 1122 mova [tmpq+32*1], m2 1123 add tmpq, 32*2 1124 sub hd, 4 1125 jg .v_w8_loop 1126 RET 1127.v_w16: 1128 vbroadcasti128 m0, [srcq+strideq*0] 1129.v_w16_loop: 1130 vbroadcasti128 m1, [srcq+strideq*1] 1131 vbroadcasti128 m2, [srcq+strideq*2] 1132 vbroadcasti128 m3, [srcq+stride3q ] 1133 lea srcq, [srcq+strideq*4] 1134 shufpd m4, m0, m2, 0x0c ; 0 2 1135 vbroadcasti128 m0, [srcq+strideq*0] 1136 shufpd m1, m3, 0x0c ; 1 3 1137 shufpd m2, m0, 0x0c ; 2 4 1138 punpcklbw m3, m4, m1 1139 punpcklbw m5, m1, m2 
1140 punpckhbw m4, m1 1141 punpckhbw m1, m2 1142 pmaddubsw m3, m6 1143 pmaddubsw m5, m6 1144 pmaddubsw m4, m6 1145 pmaddubsw m1, m6 1146 mova [tmpq+32*0], m3 1147 mova [tmpq+32*1], m5 1148 mova [tmpq+32*2], m4 1149 mova [tmpq+32*3], m1 1150 add tmpq, 32*4 1151 sub hd, 4 1152 jg .v_w16_loop 1153 RET 1154.v_w32: 1155 vpermq m0, [srcq+strideq*0], q3120 1156.v_w32_loop: 1157 vpermq m1, [srcq+strideq*1], q3120 1158 vpermq m2, [srcq+strideq*2], q3120 1159 vpermq m3, [srcq+stride3q ], q3120 1160 lea srcq, [srcq+strideq*4] 1161 punpcklbw m4, m0, m1 1162 punpckhbw m5, m0, m1 1163 vpermq m0, [srcq+strideq*0], q3120 1164 pmaddubsw m4, m6 1165 pmaddubsw m5, m6 1166 mova [tmpq+32*0], m4 1167 mova [tmpq+32*1], m5 1168 punpcklbw m4, m1, m2 1169 punpckhbw m1, m2 1170 pmaddubsw m4, m6 1171 pmaddubsw m1, m6 1172 punpcklbw m5, m2, m3 1173 punpckhbw m2, m3 1174 pmaddubsw m5, m6 1175 pmaddubsw m2, m6 1176 mova [tmpq+32*2], m4 1177 mova [tmpq+32*3], m1 1178 add tmpq, 32*8 1179 punpcklbw m1, m3, m0 1180 punpckhbw m3, m0 1181 pmaddubsw m1, m6 1182 pmaddubsw m3, m6 1183 mova [tmpq-32*4], m5 1184 mova [tmpq-32*3], m2 1185 mova [tmpq-32*2], m1 1186 mova [tmpq-32*1], m3 1187 sub hd, 4 1188 jg .v_w32_loop 1189 RET 1190.v_w64: 1191 vpermq m0, [srcq+strideq*0+32*0], q3120 1192 vpermq m1, [srcq+strideq*0+32*1], q3120 1193.v_w64_loop: 1194 vpermq m2, [srcq+strideq*1+32*0], q3120 1195 vpermq m3, [srcq+strideq*1+32*1], q3120 1196 lea srcq, [srcq+strideq*2] 1197 punpcklbw m4, m0, m2 1198 punpckhbw m0, m2 1199 pmaddubsw m4, m6 1200 pmaddubsw m0, m6 1201 mova [tmpq+32*0], m4 1202 mova [tmpq+32*1], m0 1203 punpcklbw m4, m1, m3 1204 punpckhbw m5, m1, m3 1205 vpermq m0, [srcq+strideq*0+32*0], q3120 1206 vpermq m1, [srcq+strideq*0+32*1], q3120 1207 pmaddubsw m4, m6 1208 pmaddubsw m5, m6 1209 mova [tmpq+32*2], m4 1210 mova [tmpq+32*3], m5 1211 add tmpq, 32*8 1212 punpcklbw m4, m2, m0 1213 punpckhbw m2, m0 1214 punpcklbw m5, m3, m1 1215 punpckhbw m3, m1 1216 pmaddubsw m4, m6 1217 pmaddubsw m2, m6 1218 pmaddubsw m5, m6 1219 pmaddubsw m3, m6 1220 mova [tmpq-32*4], m4 1221 mova [tmpq-32*3], m2 1222 mova [tmpq-32*2], m5 1223 mova [tmpq-32*1], m3 1224 sub hd, 2 1225 jg .v_w64_loop 1226 RET 1227.v_w128: 1228 lea r6d, [hq+(3<<8)] 1229 mov r3, srcq 1230 mov r5, tmpq 1231.v_w128_loop0: 1232 vpermq m0, [srcq+strideq*0], q3120 1233.v_w128_loop: 1234 vpermq m1, [srcq+strideq*1], q3120 1235 lea srcq, [srcq+strideq*2] 1236 punpcklbw m2, m0, m1 1237 punpckhbw m3, m0, m1 1238 vpermq m0, [srcq+strideq*0], q3120 1239 pmaddubsw m2, m6 1240 pmaddubsw m3, m6 1241 punpcklbw m4, m1, m0 1242 punpckhbw m1, m0 1243 pmaddubsw m4, m6 1244 pmaddubsw m1, m6 1245 mova [tmpq+32*0], m2 1246 mova [tmpq+32*1], m3 1247 mova [tmpq+32*8], m4 1248 mova [tmpq+32*9], m1 1249 add tmpq, 32*16 1250 sub hd, 2 1251 jg .v_w128_loop 1252 add r3, 32 1253 add r5, 64 1254 movzx hd, r6b 1255 mov srcq, r3 1256 mov tmpq, r5 1257 sub r6d, 1<<8 1258 jg .v_w128_loop0 1259 RET 1260.hv: 1261 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 1262 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) 1263 %assign stack_offset stack_offset - stack_size_padded 1264 WIN64_SPILL_XMM 7 1265 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1266 shl mxyd, 11 1267 movd xm6, mxyd 1268 vpbroadcastw m6, xm6 1269 add wq, r6 1270 lea stride3q, [strideq*3] 1271 jmp wq 1272.hv_w4: 1273 vbroadcasti128 m4, [bilin_h_shuf4] 1274 vpbroadcastq m0, [srcq+strideq*0] 1275 pshufb m0, m4 1276 pmaddubsw m0, m5 1277.hv_w4_loop: 1278 movq xm1, [srcq+strideq*1] 1279 movhps xm1, 
[srcq+strideq*2] 1280 movq xm2, [srcq+stride3q ] 1281 lea srcq, [srcq+strideq*4] 1282 movhps xm2, [srcq+strideq*0] 1283 vinserti128 m1, xm2, 1 1284 pshufb m1, m4 1285 pmaddubsw m1, m5 ; 1 2 3 4 1286 vpblendd m2, m1, m0, 0xc0 1287 vpermq m2, m2, q2103 ; 0 1 2 3 1288 mova m0, m1 1289 psubw m1, m2 1290 pmulhrsw m1, m6 1291 paddw m1, m2 1292 mova [tmpq], m1 1293 add tmpq, 32 1294 sub hd, 4 1295 jg .hv_w4_loop 1296 RET 1297.hv_w8: 1298 vbroadcasti128 m0, [srcq+strideq*0] 1299 pshufb m0, m4 1300 pmaddubsw m0, m5 1301.hv_w8_loop: 1302 movu xm1, [srcq+strideq*1] 1303 vinserti128 m1, [srcq+strideq*2], 1 1304 movu xm2, [srcq+stride3q ] 1305 lea srcq, [srcq+strideq*4] 1306 vinserti128 m2, [srcq+strideq*0], 1 1307 pshufb m1, m4 1308 pshufb m2, m4 1309 pmaddubsw m1, m5 ; 1 2 1310 vperm2i128 m3, m0, m1, 0x21 ; 0 1 1311 pmaddubsw m0, m2, m5 ; 3 4 1312 vperm2i128 m2, m1, m0, 0x21 ; 2 3 1313 psubw m1, m3 1314 pmulhrsw m1, m6 1315 paddw m1, m3 1316 psubw m3, m0, m2 1317 pmulhrsw m3, m6 1318 paddw m3, m2 1319 mova [tmpq+32*0], m1 1320 mova [tmpq+32*1], m3 1321 add tmpq, 32*2 1322 sub hd, 4 1323 jg .hv_w8_loop 1324 RET 1325.hv_w16: 1326 movu xm0, [srcq+strideq*0+8*0] 1327 vinserti128 m0, [srcq+strideq*0+8*1], 1 1328 pshufb m0, m4 1329 pmaddubsw m0, m5 1330.hv_w16_loop: 1331 movu xm1, [srcq+strideq*1+8*0] 1332 vinserti128 m1, [srcq+strideq*1+8*1], 1 1333 lea srcq, [srcq+strideq*2] 1334 movu xm2, [srcq+strideq*0+8*0] 1335 vinserti128 m2, [srcq+strideq*0+8*1], 1 1336 pshufb m1, m4 1337 pshufb m2, m4 1338 pmaddubsw m1, m5 1339 psubw m3, m1, m0 1340 pmulhrsw m3, m6 1341 paddw m3, m0 1342 pmaddubsw m0, m2, m5 1343 psubw m2, m0, m1 1344 pmulhrsw m2, m6 1345 paddw m2, m1 1346 mova [tmpq+32*0], m3 1347 mova [tmpq+32*1], m2 1348 add tmpq, 32*2 1349 sub hd, 2 1350 jg .hv_w16_loop 1351 RET 1352.hv_w32: 1353 movu xm0, [srcq+8*0] 1354 vinserti128 m0, [srcq+8*1], 1 1355 movu xm1, [srcq+8*2] 1356 vinserti128 m1, [srcq+8*3], 1 1357 pshufb m0, m4 1358 pshufb m1, m4 1359 pmaddubsw m0, m5 1360 pmaddubsw m1, m5 1361.hv_w32_loop: 1362 add srcq, strideq 1363 movu xm2, [srcq+8*0] 1364 vinserti128 m2, [srcq+8*1], 1 1365 pshufb m2, m4 1366 pmaddubsw m2, m5 1367 psubw m3, m2, m0 1368 pmulhrsw m3, m6 1369 paddw m3, m0 1370 mova m0, m2 1371 movu xm2, [srcq+8*2] 1372 vinserti128 m2, [srcq+8*3], 1 1373 pshufb m2, m4 1374 pmaddubsw m2, m5 1375 mova [tmpq+32*0], m3 1376 psubw m3, m2, m1 1377 pmulhrsw m3, m6 1378 paddw m3, m1 1379 mova m1, m2 1380 mova [tmpq+32*1], m3 1381 add tmpq, 32*2 1382 dec hd 1383 jg .hv_w32_loop 1384 RET 1385.hv_w128: 1386 lea r3d, [hq+(7<<8)] 1387 mov r6d, 256 1388 jmp .hv_w64_start 1389.hv_w64: 1390 lea r3d, [hq+(3<<8)] 1391 mov r6d, 128 1392.hv_w64_start: 1393%if WIN64 1394 PUSH r7 1395%endif 1396 mov r5, srcq 1397 mov r7, tmpq 1398.hv_w64_loop0: 1399 movu xm0, [srcq+strideq*0+8*0] 1400 vinserti128 m0, [srcq+strideq*0+8*1], 1 1401 pshufb m0, m4 1402 pmaddubsw m0, m5 1403.hv_w64_loop: 1404 movu xm1, [srcq+strideq*1+8*0] 1405 vinserti128 m1, [srcq+strideq*1+8*1], 1 1406 lea srcq, [srcq+strideq*2] 1407 movu xm2, [srcq+strideq*0+8*0] 1408 vinserti128 m2, [srcq+strideq*0+8*1], 1 1409 pshufb m1, m4 1410 pshufb m2, m4 1411 pmaddubsw m1, m5 1412 psubw m3, m1, m0 1413 pmulhrsw m3, m6 1414 paddw m3, m0 1415 pmaddubsw m0, m2, m5 1416 psubw m2, m0, m1 1417 pmulhrsw m2, m6 1418 paddw m2, m1 1419 mova [tmpq+r6*0], m3 1420 mova [tmpq+r6*1], m2 1421 lea tmpq, [tmpq+r6*2] 1422 sub hd, 2 1423 jg .hv_w64_loop 1424 add r5, 16 1425 add r7, 32 1426 movzx hd, r3b 1427 mov srcq, r5 1428 mov tmpq, r7 1429 sub r3d, 1<<8 1430 jg 
.hv_w64_loop0 1431%if WIN64 1432 POP r7 1433%endif 1434 RET 1435 1436; int8_t subpel_filters[5][15][8] 1437%assign FILTER_REGULAR (0*15 << 16) | 3*15 1438%assign FILTER_SMOOTH (1*15 << 16) | 4*15 1439%assign FILTER_SHARP (2*15 << 16) | 3*15 1440 1441%macro FN 4 ; fn, type, type_h, type_v 1442cglobal %1_%2 1443 mov t0d, FILTER_%3 1444%ifidn %3, %4 1445 mov t1d, t0d 1446%else 1447 mov t1d, FILTER_%4 1448%endif 1449%ifnidn %2, regular ; skip the jump in the last filter 1450 jmp mangle(private_prefix %+ _%1 %+ SUFFIX) 1451%endif 1452%endmacro 1453 1454%if WIN64 1455DECLARE_REG_TMP 4, 5 1456%else 1457DECLARE_REG_TMP 7, 8 1458%endif 1459 1460%define PUT_8TAP_FN FN put_8tap, 1461 1462PUT_8TAP_FN sharp, SHARP, SHARP 1463PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH 1464PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP 1465PUT_8TAP_FN smooth, SMOOTH, SMOOTH 1466PUT_8TAP_FN sharp_regular, SHARP, REGULAR 1467PUT_8TAP_FN regular_sharp, REGULAR, SHARP 1468PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR 1469PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH 1470PUT_8TAP_FN regular, REGULAR, REGULAR 1471 1472cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 1473 imul mxd, mxm, 0x010101 1474 add mxd, t0d ; 8tap_h, mx, 4tap_h 1475 imul myd, mym, 0x010101 1476 add myd, t1d ; 8tap_v, my, 4tap_v 1477 lea r8, [put_avx2] 1478 movsxd wq, wm 1479 movifnidn hd, hm 1480 test mxd, 0xf00 1481 jnz .h 1482 test myd, 0xf00 1483 jnz .v 1484 tzcnt wd, wd 1485 movzx wd, word [r8+wq*2+table_offset(put,)] 1486 add wq, r8 1487 lea r6, [ssq*3] 1488 lea r7, [dsq*3] 1489%if WIN64 1490 pop r8 1491%endif 1492 jmp wq 1493.h: 1494 test myd, 0xf00 1495 jnz .hv 1496 vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) 1497 WIN64_SPILL_XMM 11 1498 cmp wd, 4 1499 jl .h_w2 1500 vbroadcasti128 m6, [subpel_h_shufA] 1501 je .h_w4 1502 tzcnt wd, wd 1503 vbroadcasti128 m7, [subpel_h_shufB] 1504 vbroadcasti128 m8, [subpel_h_shufC] 1505 shr mxd, 16 1506 sub srcq, 3 1507 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] 1508 vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] 1509 vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] 1510 add wq, r8 1511 jmp wq 1512.h_w2: 1513 movzx mxd, mxb 1514 dec srcq 1515 mova xm4, [subpel_h_shuf4] 1516 vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] 1517.h_w2_loop: 1518 movq xm0, [srcq+ssq*0] 1519 movhps xm0, [srcq+ssq*1] 1520 lea srcq, [srcq+ssq*2] 1521 pshufb xm0, xm4 1522 pmaddubsw xm0, xm3 1523 phaddw xm0, xm0 1524 paddw xm0, xm5 1525 psraw xm0, 6 1526 packuswb xm0, xm0 1527 pextrw [dstq+dsq*0], xm0, 0 1528 pextrw [dstq+dsq*1], xm0, 1 1529 lea dstq, [dstq+dsq*2] 1530 sub hd, 2 1531 jg .h_w2_loop 1532 RET 1533.h_w4: 1534 movzx mxd, mxb 1535 dec srcq 1536 vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] 1537.h_w4_loop: 1538 movq xm0, [srcq+ssq*0] 1539 movq xm1, [srcq+ssq*1] 1540 lea srcq, [srcq+ssq*2] 1541 pshufb xm0, xm6 1542 pshufb xm1, xm6 1543 pmaddubsw xm0, xm3 1544 pmaddubsw xm1, xm3 1545 phaddw xm0, xm1 1546 paddw xm0, xm5 1547 psraw xm0, 6 1548 packuswb xm0, xm0 1549 movd [dstq+dsq*0], xm0 1550 pextrd [dstq+dsq*1], xm0, 1 1551 lea dstq, [dstq+dsq*2] 1552 sub hd, 2 1553 jg .h_w4_loop 1554 RET 1555.h_w8: 1556%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] 1557 pshufb m%2, m%1, m7 1558 pshufb m%3, m%1, m8 1559 pshufb m%1, m6 1560 pmaddubsw m%4, m%2, m9 1561 pmaddubsw m%2, m10 1562 pmaddubsw m%3, m10 1563 pmaddubsw m%1, m9 1564 paddw m%3, m%4 1565 paddw m%1, m%2 1566 phaddw m%1, m%3 1567 paddw m%1, m5 1568 psraw m%1, 6 1569%endmacro 1570 movu xm0, [srcq+ssq*0] 1571 vinserti128 m0, [srcq+ssq*1], 1 1572 lea srcq, 
[srcq+ssq*2] 1573 PUT_8TAP_H 0, 1, 2, 3 1574 vextracti128 xm1, m0, 1 1575 packuswb xm0, xm1 1576 movq [dstq+dsq*0], xm0 1577 movhps [dstq+dsq*1], xm0 1578 lea dstq, [dstq+dsq*2] 1579 sub hd, 2 1580 jg .h_w8 1581 RET 1582.h_w16: 1583 movu xm0, [srcq+ssq*0+8*0] 1584 vinserti128 m0, [srcq+ssq*1+8*0], 1 1585 movu xm1, [srcq+ssq*0+8*1] 1586 vinserti128 m1, [srcq+ssq*1+8*1], 1 1587 PUT_8TAP_H 0, 2, 3, 4 1588 lea srcq, [srcq+ssq*2] 1589 PUT_8TAP_H 1, 2, 3, 4 1590 packuswb m0, m1 1591 mova [dstq+dsq*0], xm0 1592 vextracti128 [dstq+dsq*1], m0, 1 1593 lea dstq, [dstq+dsq*2] 1594 sub hd, 2 1595 jg .h_w16 1596 RET 1597.h_w32: 1598 xor r6d, r6d 1599 jmp .h_start 1600.h_w64: 1601 mov r6, -32*1 1602 jmp .h_start 1603.h_w128: 1604 mov r6, -32*3 1605.h_start: 1606 sub srcq, r6 1607 sub dstq, r6 1608 mov r4, r6 1609.h_loop: 1610 movu m0, [srcq+r6+8*0] 1611 movu m1, [srcq+r6+8*1] 1612 PUT_8TAP_H 0, 2, 3, 4 1613 PUT_8TAP_H 1, 2, 3, 4 1614 packuswb m0, m1 1615 mova [dstq+r6], m0 1616 add r6, 32 1617 jle .h_loop 1618 add srcq, ssq 1619 add dstq, dsq 1620 mov r6, r4 1621 dec hd 1622 jg .h_loop 1623 RET 1624.v: 1625 %assign stack_offset stack_offset - stack_size_padded 1626 WIN64_SPILL_XMM 16 1627 movzx mxd, myb 1628 shr myd, 16 1629 cmp hd, 6 1630 cmovs myd, mxd 1631 tzcnt r6d, wd 1632 movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] 1633 vpbroadcastd m7, [pw_512] 1634 lea myq, [r8+myq*8+subpel_filters-put_avx2] 1635 vpbroadcastw m8, [myq+0] 1636 vpbroadcastw m9, [myq+2] 1637 vpbroadcastw m10, [myq+4] 1638 vpbroadcastw m11, [myq+6] 1639 add r6, r8 1640 lea ss3q, [ssq*3] 1641 sub srcq, ss3q 1642 jmp r6 1643.v_w2: 1644 movd xm2, [srcq+ssq*0] 1645 pinsrw xm2, [srcq+ssq*1], 2 1646 pinsrw xm2, [srcq+ssq*2], 4 1647 add srcq, ss3q 1648 pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 1649 movd xm3, [srcq+ssq*1] 1650 vpbroadcastd xm1, [srcq+ssq*2] 1651 add srcq, ss3q 1652 vpbroadcastd xm0, [srcq+ssq*0] 1653 vpblendd xm3, xm1, 0x02 ; 4 5 1654 vpblendd xm1, xm0, 0x02 ; 5 6 1655 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 1656 punpcklbw xm3, xm1 ; 45 56 1657 punpcklbw xm1, xm2, xm4 ; 01 12 1658 punpckhbw xm2, xm4 ; 23 34 1659.v_w2_loop: 1660 pmaddubsw xm5, xm1, xm8 ; a0 b0 1661 mova xm1, xm2 1662 pmaddubsw xm2, xm9 ; a1 b1 1663 paddw xm5, xm2 1664 mova xm2, xm3 1665 pmaddubsw xm3, xm10 ; a2 b2 1666 paddw xm5, xm3 1667 vpbroadcastd xm4, [srcq+ssq*1] 1668 lea srcq, [srcq+ssq*2] 1669 vpblendd xm3, xm0, xm4, 0x02 ; 6 7 1670 vpbroadcastd xm0, [srcq+ssq*0] 1671 vpblendd xm4, xm0, 0x02 ; 7 8 1672 punpcklbw xm3, xm4 ; 67 78 1673 pmaddubsw xm4, xm3, xm11 ; a3 b3 1674 paddw xm5, xm4 1675 pmulhrsw xm5, xm7 1676 packuswb xm5, xm5 1677 pextrw [dstq+dsq*0], xm5, 0 1678 pextrw [dstq+dsq*1], xm5, 2 1679 lea dstq, [dstq+dsq*2] 1680 sub hd, 2 1681 jg .v_w2_loop 1682 RET 1683.v_w4: 1684 movd xm2, [srcq+ssq*0] 1685 pinsrd xm2, [srcq+ssq*1], 1 1686 pinsrd xm2, [srcq+ssq*2], 2 1687 add srcq, ss3q 1688 pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 1689 movd xm3, [srcq+ssq*1] 1690 vpbroadcastd xm1, [srcq+ssq*2] 1691 add srcq, ss3q 1692 vpbroadcastd xm0, [srcq+ssq*0] 1693 vpblendd xm3, xm1, 0x02 ; 4 5 1694 vpblendd xm1, xm0, 0x02 ; 5 6 1695 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 1696 punpcklbw xm3, xm1 ; 45 56 1697 punpcklbw xm1, xm2, xm4 ; 01 12 1698 punpckhbw xm2, xm4 ; 23 34 1699.v_w4_loop: 1700 pmaddubsw xm5, xm1, xm8 ; a0 b0 1701 mova xm1, xm2 1702 pmaddubsw xm2, xm9 ; a1 b1 1703 paddw xm5, xm2 1704 mova xm2, xm3 1705 pmaddubsw xm3, xm10 ; a2 b2 1706 paddw xm5, xm3 1707 vpbroadcastd xm4, [srcq+ssq*1] 1708 lea srcq, [srcq+ssq*2] 1709 vpblendd xm3, xm0, xm4, 0x02 ; 6 7 
1710 vpbroadcastd xm0, [srcq+ssq*0] 1711 vpblendd xm4, xm0, 0x02 ; 7 8 1712 punpcklbw xm3, xm4 ; 67 78 1713 pmaddubsw xm4, xm3, xm11 ; a3 b3 1714 paddw xm5, xm4 1715 pmulhrsw xm5, xm7 1716 packuswb xm5, xm5 1717 movd [dstq+dsq*0], xm5 1718 pextrd [dstq+dsq*1], xm5, 1 1719 lea dstq, [dstq+dsq*2] 1720 sub hd, 2 1721 jg .v_w4_loop 1722 RET 1723.v_w8: 1724 movq xm1, [srcq+ssq*0] 1725 vpbroadcastq m4, [srcq+ssq*1] 1726 vpbroadcastq m2, [srcq+ssq*2] 1727 add srcq, ss3q 1728 vpbroadcastq m5, [srcq+ssq*0] 1729 vpbroadcastq m3, [srcq+ssq*1] 1730 vpbroadcastq m6, [srcq+ssq*2] 1731 add srcq, ss3q 1732 vpbroadcastq m0, [srcq+ssq*0] 1733 vpblendd m1, m4, 0x30 1734 vpblendd m4, m2, 0x30 1735 punpcklbw m1, m4 ; 01 12 1736 vpblendd m2, m5, 0x30 1737 vpblendd m5, m3, 0x30 1738 punpcklbw m2, m5 ; 23 34 1739 vpblendd m3, m6, 0x30 1740 vpblendd m6, m0, 0x30 1741 punpcklbw m3, m6 ; 45 56 1742.v_w8_loop: 1743 vpbroadcastq m4, [srcq+ssq*1] 1744 lea srcq, [srcq+ssq*2] 1745 pmaddubsw m5, m1, m8 ; a0 b0 1746 mova m1, m2 1747 pmaddubsw m2, m9 ; a1 b1 1748 paddw m5, m2 1749 mova m2, m3 1750 pmaddubsw m3, m10 ; a2 b2 1751 paddw m5, m3 1752 vpblendd m3, m0, m4, 0x30 1753 vpbroadcastq m0, [srcq+ssq*0] 1754 vpblendd m4, m0, 0x30 1755 punpcklbw m3, m4 ; 67 78 1756 pmaddubsw m4, m3, m11 ; a3 b3 1757 paddw m5, m4 1758 pmulhrsw m5, m7 1759 vextracti128 xm4, m5, 1 1760 packuswb xm5, xm4 1761 movq [dstq+dsq*0], xm5 1762 movhps [dstq+dsq*1], xm5 1763 lea dstq, [dstq+dsq*2] 1764 sub hd, 2 1765 jg .v_w8_loop 1766 RET 1767.v_w16: 1768.v_w32: 1769.v_w64: 1770.v_w128: 1771 lea r6d, [wq*8-128] 1772 mov r4, srcq 1773 mov r7, dstq 1774 lea r6d, [hq+r6*2] 1775.v_w16_loop0: 1776 vbroadcasti128 m4, [srcq+ssq*0] 1777 vbroadcasti128 m5, [srcq+ssq*1] 1778 vbroadcasti128 m6, [srcq+ssq*2] 1779 add srcq, ss3q 1780 vbroadcasti128 m0, [srcq+ssq*0] 1781 vbroadcasti128 m1, [srcq+ssq*1] 1782 vbroadcasti128 m2, [srcq+ssq*2] 1783 add srcq, ss3q 1784 vbroadcasti128 m3, [srcq+ssq*0] 1785 shufpd m4, m0, 0x0c 1786 shufpd m5, m1, 0x0c 1787 punpcklbw m1, m4, m5 ; 01 1788 punpckhbw m4, m5 ; 34 1789 shufpd m6, m2, 0x0c 1790 punpcklbw m2, m5, m6 ; 12 1791 punpckhbw m5, m6 ; 45 1792 shufpd m0, m3, 0x0c 1793 punpcklbw m3, m6, m0 ; 23 1794 punpckhbw m6, m0 ; 56 1795.v_w16_loop: 1796 vbroadcasti128 m12, [srcq+ssq*1] 1797 lea srcq, [srcq+ssq*2] 1798 vbroadcasti128 m13, [srcq+ssq*0] 1799 pmaddubsw m14, m1, m8 ; a0 1800 pmaddubsw m15, m2, m8 ; b0 1801 mova m1, m3 1802 mova m2, m4 1803 pmaddubsw m3, m9 ; a1 1804 pmaddubsw m4, m9 ; b1 1805 paddw m14, m3 1806 paddw m15, m4 1807 mova m3, m5 1808 mova m4, m6 1809 pmaddubsw m5, m10 ; a2 1810 pmaddubsw m6, m10 ; b2 1811 paddw m14, m5 1812 paddw m15, m6 1813 shufpd m6, m0, m12, 0x0d 1814 shufpd m0, m12, m13, 0x0c 1815 punpcklbw m5, m6, m0 ; 67 1816 punpckhbw m6, m0 ; 78 1817 pmaddubsw m12, m5, m11 ; a3 1818 pmaddubsw m13, m6, m11 ; b3 1819 paddw m14, m12 1820 paddw m15, m13 1821 pmulhrsw m14, m7 1822 pmulhrsw m15, m7 1823 packuswb m14, m15 1824 vpermq m14, m14, q3120 1825 mova [dstq+dsq*0], xm14 1826 vextracti128 [dstq+dsq*1], m14, 1 1827 lea dstq, [dstq+dsq*2] 1828 sub hd, 2 1829 jg .v_w16_loop 1830 add r4, 16 1831 add r7, 16 1832 movzx hd, r6b 1833 mov srcq, r4 1834 mov dstq, r7 1835 sub r6d, 1<<8 1836 jg .v_w16_loop0 1837 RET 1838.hv: 1839 %assign stack_offset stack_offset - stack_size_padded 1840 WIN64_SPILL_XMM 16 1841 cmp wd, 4 1842 jg .hv_w8 1843 movzx mxd, mxb 1844 dec srcq 1845 vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] 1846 movzx mxd, myb 1847 shr myd, 16 1848 cmp hd, 6 1849 cmovs myd, mxd 1850 
vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] 1851 lea ss3q, [ssq*3] 1852 sub srcq, ss3q 1853 punpcklbw m0, m0 1854 psraw m0, 8 ; sign-extend 1855 vpbroadcastd m8, [pw_8192] 1856 vpbroadcastd m9, [pd_512] 1857 pshufd m10, m0, q0000 1858 pshufd m11, m0, q1111 1859 pshufd m12, m0, q2222 1860 pshufd m13, m0, q3333 1861 cmp wd, 4 1862 je .hv_w4 1863 vbroadcasti128 m6, [subpel_h_shuf4] 1864 movq xm2, [srcq+ssq*0] 1865 movhps xm2, [srcq+ssq*1] 1866 movq xm0, [srcq+ssq*2] 1867 add srcq, ss3q 1868 movhps xm0, [srcq+ssq*0] 1869 vpbroadcastq m3, [srcq+ssq*1] 1870 vpbroadcastq m4, [srcq+ssq*2] 1871 add srcq, ss3q 1872 vpbroadcastq m1, [srcq+ssq*0] 1873 vpblendd m2, m3, 0x30 1874 vpblendd m0, m1, 0x30 1875 vpblendd m2, m4, 0xc0 1876 pshufb m2, m6 1877 pshufb m0, m6 1878 pmaddubsw m2, m7 1879 pmaddubsw m0, m7 1880 phaddw m2, m0 1881 pmulhrsw m2, m8 1882 vextracti128 xm3, m2, 1 1883 palignr xm4, xm3, xm2, 4 1884 punpcklwd xm1, xm2, xm4 ; 01 12 1885 punpckhwd xm2, xm4 ; 23 34 1886 pshufd xm0, xm3, q2121 1887 punpcklwd xm3, xm0 ; 45 56 1888.hv_w2_loop: 1889 movq xm4, [srcq+ssq*1] 1890 lea srcq, [srcq+ssq*2] 1891 movhps xm4, [srcq+ssq*0] 1892 pshufb xm4, xm6 1893 pmaddubsw xm4, xm7 1894 pmaddwd xm5, xm1, xm10 ; a0 b0 1895 mova xm1, xm2 1896 pmaddwd xm2, xm11 ; a1 b1 1897 paddd xm5, xm2 1898 mova xm2, xm3 1899 pmaddwd xm3, xm12 ; a2 b2 1900 phaddw xm4, xm4 1901 pmulhrsw xm4, xm8 1902 paddd xm5, xm3 1903 palignr xm3, xm4, xm0, 12 1904 mova xm0, xm4 1905 punpcklwd xm3, xm0 ; 67 78 1906 pmaddwd xm4, xm3, xm13 ; a3 b3 1907 paddd xm5, xm9 1908 paddd xm5, xm4 1909 psrad xm5, 10 1910 packssdw xm5, xm5 1911 packuswb xm5, xm5 1912 pextrw [dstq+dsq*0], xm5, 0 1913 pextrw [dstq+dsq*1], xm5, 1 1914 lea dstq, [dstq+dsq*2] 1915 sub hd, 2 1916 jg .hv_w2_loop 1917 RET 1918.hv_w4: 1919 mova m6, [subpel_h_shuf4] 1920 vpbroadcastq m2, [srcq+ssq*0] 1921 vpbroadcastq m4, [srcq+ssq*1] 1922 vpbroadcastq m0, [srcq+ssq*2] 1923 add srcq, ss3q 1924 vpbroadcastq m5, [srcq+ssq*0] 1925 vpbroadcastq m3, [srcq+ssq*1] 1926 vpblendd m2, m4, 0xcc ; 0 1 1927 vpbroadcastq m4, [srcq+ssq*2] 1928 add srcq, ss3q 1929 vpbroadcastq m1, [srcq+ssq*0] 1930 vpblendd m0, m5, 0xcc ; 2 3 1931 vpblendd m3, m4, 0xcc ; 4 5 1932 pshufb m2, m6 1933 pshufb m0, m6 1934 pshufb m3, m6 1935 pshufb m1, m6 1936 pmaddubsw m2, m7 1937 pmaddubsw m0, m7 1938 pmaddubsw m3, m7 1939 pmaddubsw m1, m7 1940 phaddw m2, m0 1941 phaddw m3, m1 1942 pmulhrsw m2, m8 1943 pmulhrsw m3, m8 1944 palignr m4, m3, m2, 4 1945 punpcklwd m1, m2, m4 ; 01 12 1946 punpckhwd m2, m4 ; 23 34 1947 pshufd m0, m3, q2121 1948 punpcklwd m3, m0 ; 45 56 1949.hv_w4_loop: 1950 vpbroadcastq m4, [srcq+ssq*1] 1951 lea srcq, [srcq+ssq*2] 1952 pmaddwd m5, m1, m10 ; a0 b0 1953 mova m1, m2 1954 pmaddwd m2, m11 ; a1 b1 1955 paddd m5, m2 1956 mova m2, m3 1957 pmaddwd m3, m12 ; a2 b2 1958 paddd m5, m3 1959 vpbroadcastq m3, [srcq+ssq*0] 1960 vpblendd m4, m3, 0xcc ; 7 8 1961 pshufb m4, m6 1962 pmaddubsw m4, m7 1963 phaddw m4, m4 1964 pmulhrsw m4, m8 1965 palignr m3, m4, m0, 12 1966 mova m0, m4 1967 punpcklwd m3, m0 ; 67 78 1968 pmaddwd m4, m3, m13 ; a3 b3 1969 paddd m5, m9 1970 paddd m5, m4 1971 psrad m5, 10 1972 vextracti128 xm4, m5, 1 1973 packssdw xm5, xm4 1974 packuswb xm5, xm5 1975 pshuflw xm5, xm5, q3120 1976 movd [dstq+dsq*0], xm5 1977 pextrd [dstq+dsq*1], xm5, 1 1978 lea dstq, [dstq+dsq*2] 1979 sub hd, 2 1980 jg .hv_w4_loop 1981 RET 1982.hv_w8: 1983 shr mxd, 16 1984 sub srcq, 3 1985 vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] 1986 vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] 1987 movzx 
mxd, myb 1988 shr myd, 16 1989 cmp hd, 6 1990 cmovs myd, mxd 1991 vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] 1992 lea ss3q, [ssq*3] 1993 sub srcq, ss3q 1994 punpcklbw m0, m0 1995 psraw m0, 8 ; sign-extend 1996 pshufd m12, m0, q0000 1997 pshufd m13, m0, q1111 1998 pshufd m14, m0, q2222 1999 pshufd m15, m0, q3333 2000 lea r6d, [wq*8-64] 2001 mov r4, srcq 2002 mov r7, dstq 2003 lea r6d, [hq+r6*4] 2004.hv_w8_loop0: 2005 vbroadcasti128 m7, [subpel_h_shufA] 2006 movu xm4, [srcq+ssq*0] 2007 vbroadcasti128 m8, [subpel_h_shufB] 2008 movu xm5, [srcq+ssq*1] 2009 vbroadcasti128 m9, [subpel_h_shufC] 2010 movu xm6, [srcq+ssq*2] 2011 add srcq, ss3q 2012 vbroadcasti128 m0, [srcq+ssq*0] 2013 vpblendd m4, m0, 0xf0 ; 0 3 2014 vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 2015 vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 2016 add srcq, ss3q 2017 vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 2018%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] 2019 pshufb %3, %1, %6 2020 pshufb %4, %1, %7 2021 pshufb %1, %5 2022 pmaddubsw %2, %3, m10 2023 pmaddubsw %4, m11 2024 pmaddubsw %3, m11 2025 pmaddubsw %1, m10 2026 paddw %2, %4 2027 paddw %1, %3 2028 phaddw %1, %2 2029%endmacro 2030 HV_H_W8 m4, m1, m2, m3, m7, m8, m9 2031 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 2032 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 2033 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 2034 vpbroadcastd m7, [pw_8192] 2035 vpermq m4, m4, q3120 2036 vpermq m5, m5, q3120 2037 vpermq m6, m6, q3120 2038 pmulhrsw m0, m7 2039 pmulhrsw m4, m7 2040 pmulhrsw m5, m7 2041 pmulhrsw m6, m7 2042 vpermq m7, m0, q3120 2043 punpcklwd m1, m4, m5 ; 01 2044 punpckhwd m4, m5 ; 34 2045 punpcklwd m2, m5, m6 ; 12 2046 punpckhwd m5, m6 ; 45 2047 punpcklwd m3, m6, m7 ; 23 2048 punpckhwd m6, m7 ; 56 2049.hv_w8_loop: 2050 vextracti128 r6m, m0, 1 ; not enough registers 2051 movu xm0, [srcq+ssq*1] 2052 lea srcq, [srcq+ssq*2] 2053 vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 2054 pmaddwd m8, m1, m12 ; a0 2055 pmaddwd m9, m2, m12 ; b0 2056 mova m1, m3 2057 mova m2, m4 2058 pmaddwd m3, m13 ; a1 2059 pmaddwd m4, m13 ; b1 2060 paddd m8, m3 2061 paddd m9, m4 2062 mova m3, m5 2063 mova m4, m6 2064 pmaddwd m5, m14 ; a2 2065 pmaddwd m6, m14 ; b2 2066 paddd m8, m5 2067 paddd m9, m6 2068 vbroadcasti128 m6, [subpel_h_shufB] 2069 vbroadcasti128 m7, [subpel_h_shufC] 2070 vbroadcasti128 m5, [subpel_h_shufA] 2071 HV_H_W8 m0, m5, m6, m7, m5, m6, m7 2072 vpbroadcastd m5, [pw_8192] 2073 vpbroadcastd m7, [pd_512] 2074 vbroadcasti128 m6, r6m 2075 pmulhrsw m0, m5 2076 paddd m8, m7 2077 paddd m9, m7 2078 vpermq m7, m0, q3120 ; 7 8 2079 shufpd m6, m6, m7, 0x04 ; 6 7 2080 punpcklwd m5, m6, m7 ; 67 2081 punpckhwd m6, m7 ; 78 2082 pmaddwd m7, m5, m15 ; a3 2083 paddd m8, m7 2084 pmaddwd m7, m6, m15 ; b3 2085 paddd m7, m9 2086 psrad m8, 10 2087 psrad m7, 10 2088 packssdw m8, m7 2089 vextracti128 xm7, m8, 1 2090 packuswb xm8, xm7 2091 pshufd xm7, xm8, q3120 2092 movq [dstq+dsq*0], xm7 2093 movhps [dstq+dsq*1], xm7 2094 lea dstq, [dstq+dsq*2] 2095 sub hd, 2 2096 jg .hv_w8_loop 2097 add r4, 8 2098 add r7, 8 2099 movzx hd, r6b 2100 mov srcq, r4 2101 mov dstq, r7 2102 sub r6d, 1<<8 2103 jg .hv_w8_loop0 2104 RET 2105 2106%macro PREP_8TAP_H 0 2107 pshufb m1, m0, m5 2108 pshufb m2, m0, m6 2109 pshufb m3, m0, m7 2110 pmaddubsw m1, m8 2111 pmaddubsw m0, m2, m8 2112 pmaddubsw m2, m9 2113 pmaddubsw m3, m9 2114 paddw m1, m2 2115 paddw m0, m3 2116 phaddw m0, m1, m0 2117 pmulhrsw m0, m4 2118%endmacro 2119 2120%if WIN64 2121DECLARE_REG_TMP 6, 4 2122%else 2123DECLARE_REG_TMP 6, 7 2124%endif 2125 2126%define PREP_8TAP_FN FN prep_8tap, 2127 2128PREP_8TAP_FN sharp, 
SHARP, SHARP 2129PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH 2130PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP 2131PREP_8TAP_FN smooth, SMOOTH, SMOOTH 2132PREP_8TAP_FN sharp_regular, SHARP, REGULAR 2133PREP_8TAP_FN regular_sharp, REGULAR, SHARP 2134PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR 2135PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH 2136PREP_8TAP_FN regular, REGULAR, REGULAR 2137 2138cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 2139 imul mxd, mxm, 0x010101 2140 add mxd, t0d ; 8tap_h, mx, 4tap_h 2141 imul myd, mym, 0x010101 2142 add myd, t1d ; 8tap_v, my, 4tap_v 2143 lea r7, [prep%+SUFFIX] 2144 movsxd wq, wm 2145 movifnidn hd, hm 2146 test mxd, 0xf00 2147 jnz .h 2148 test myd, 0xf00 2149 jnz .v 2150 tzcnt wd, wd 2151 movzx wd, word [r7+wq*2+table_offset(prep,)] 2152 add wq, r7 2153 lea r6, [strideq*3] 2154%if WIN64 2155 pop r7 2156%endif 2157 jmp wq 2158.h: 2159 test myd, 0xf00 2160 jnz .hv 2161 vpbroadcastd m4, [pw_8192] 2162 vbroadcasti128 m5, [subpel_h_shufA] 2163 WIN64_SPILL_XMM 10 2164 cmp wd, 4 2165 je .h_w4 2166 tzcnt wd, wd 2167 vbroadcasti128 m6, [subpel_h_shufB] 2168 vbroadcasti128 m7, [subpel_h_shufC] 2169 shr mxd, 16 2170 sub srcq, 3 2171 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] 2172 vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] 2173 vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] 2174 add wq, r7 2175 jmp wq 2176.h_w4: 2177 movzx mxd, mxb 2178 dec srcq 2179 vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] 2180 lea stride3q, [strideq*3] 2181.h_w4_loop: 2182 movq xm0, [srcq+strideq*0] 2183 vpbroadcastq m2, [srcq+strideq*2] 2184 movq xm1, [srcq+strideq*1] 2185 vpblendd m0, m2, 0xf0 2186 vpbroadcastq m2, [srcq+stride3q ] 2187 lea srcq, [srcq+strideq*4] 2188 vpblendd m1, m2, 0xf0 2189 pshufb m0, m5 2190 pshufb m1, m5 2191 pmaddubsw m0, m6 2192 pmaddubsw m1, m6 2193 phaddw m0, m1 2194 pmulhrsw m0, m4 2195 mova [tmpq], m0 2196 add tmpq, 32 2197 sub hd, 4 2198 jg .h_w4_loop 2199 RET 2200.h_w8: 2201 movu xm0, [srcq+strideq*0] 2202 vinserti128 m0, [srcq+strideq*1], 1 2203 lea srcq, [srcq+strideq*2] 2204 PREP_8TAP_H 2205 mova [tmpq], m0 2206 add tmpq, 32 2207 sub hd, 2 2208 jg .h_w8 2209 RET 2210.h_w16: 2211 movu xm0, [srcq+strideq*0+8*0] 2212 vinserti128 m0, [srcq+strideq*0+8*1], 1 2213 PREP_8TAP_H 2214 mova [tmpq+32*0], m0 2215 movu xm0, [srcq+strideq*1+8*0] 2216 vinserti128 m0, [srcq+strideq*1+8*1], 1 2217 lea srcq, [srcq+strideq*2] 2218 PREP_8TAP_H 2219 mova [tmpq+32*1], m0 2220 add tmpq, 32*2 2221 sub hd, 2 2222 jg .h_w16 2223 RET 2224.h_w32: 2225 xor r6d, r6d 2226 jmp .h_start 2227.h_w64: 2228 mov r6, -32*1 2229 jmp .h_start 2230.h_w128: 2231 mov r6, -32*3 2232.h_start: 2233 sub srcq, r6 2234 mov r5, r6 2235.h_loop: 2236 movu xm0, [srcq+r6+8*0] 2237 vinserti128 m0, [srcq+r6+8*1], 1 2238 PREP_8TAP_H 2239 mova [tmpq+32*0], m0 2240 movu xm0, [srcq+r6+8*2] 2241 vinserti128 m0, [srcq+r6+8*3], 1 2242 PREP_8TAP_H 2243 mova [tmpq+32*1], m0 2244 add tmpq, 32*2 2245 add r6, 32 2246 jle .h_loop 2247 add srcq, strideq 2248 mov r6, r5 2249 dec hd 2250 jg .h_loop 2251 RET 2252.v: 2253 %assign stack_offset stack_offset - stack_size_padded 2254 WIN64_SPILL_XMM 16 2255 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 2256 shr myd, 16 ; Note that the code is 8-tap only, having 2257 cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 2258 cmove myd, mxd ; had a negligible effect on performance. 2259 ; TODO: Would a 6-tap code path be worth it? 
2260 lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] 2261 lea stride3q, [strideq*3] 2262 sub srcq, stride3q 2263 vpbroadcastd m7, [pw_8192] 2264 vpbroadcastw m8, [myq+0] 2265 vpbroadcastw m9, [myq+2] 2266 vpbroadcastw m10, [myq+4] 2267 vpbroadcastw m11, [myq+6] 2268 cmp wd, 8 2269 jg .v_w16 2270 je .v_w8 2271.v_w4: 2272 movd xm0, [srcq+strideq*0] 2273 vpbroadcastd m1, [srcq+strideq*2] 2274 vpbroadcastd xm2, [srcq+strideq*1] 2275 add srcq, stride3q 2276 vpbroadcastd m3, [srcq+strideq*0] 2277 vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ 2278 vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ 2279 vpbroadcastd m0, [srcq+strideq*1] 2280 vpbroadcastd m2, [srcq+strideq*2] 2281 vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ 2282 vpbroadcastd m0, [srcq+stride3q ] 2283 vbroadcasti128 m5, [deint_shuf4] 2284 vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 2285 vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 2286 vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ 2287 punpcklbw m1, m2, m3 ; 01 12 23 34 2288 vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 2289 punpckhbw m2, m3 ; 23 34 45 56 2290.v_w4_loop: 2291 lea srcq, [srcq+strideq*4] 2292 pinsrd xm0, [srcq+strideq*0], 1 2293 vpbroadcastd m3, [srcq+strideq*1] 2294 vpbroadcastd m4, [srcq+strideq*2] 2295 vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ 2296 vpbroadcastd m0, [srcq+stride3q ] 2297 vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ 2298 vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ 2299 pshufb m3, m5 ; 67 78 89 9a 2300 pmaddubsw m4, m1, m8 2301 vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 2302 pmaddubsw m2, m9 2303 paddw m4, m2 2304 mova m2, m3 2305 pmaddubsw m3, m11 2306 paddw m3, m4 2307 pmaddubsw m4, m1, m10 2308 paddw m3, m4 2309 pmulhrsw m3, m7 2310 mova [tmpq], m3 2311 add tmpq, 32 2312 sub hd, 4 2313 jg .v_w4_loop 2314 RET 2315.v_w8: 2316 movq xm1, [srcq+strideq*0] 2317 vpbroadcastq m4, [srcq+strideq*1] 2318 vpbroadcastq m2, [srcq+strideq*2] 2319 vpbroadcastq m5, [srcq+stride3q ] 2320 lea srcq, [srcq+strideq*4] 2321 vpbroadcastq m3, [srcq+strideq*0] 2322 vpbroadcastq m6, [srcq+strideq*1] 2323 vpbroadcastq m0, [srcq+strideq*2] 2324 vpblendd m1, m4, 0x30 2325 vpblendd m4, m2, 0x30 2326 punpcklbw m1, m4 ; 01 12 2327 vpblendd m2, m5, 0x30 2328 vpblendd m5, m3, 0x30 2329 punpcklbw m2, m5 ; 23 34 2330 vpblendd m3, m6, 0x30 2331 vpblendd m6, m0, 0x30 2332 punpcklbw m3, m6 ; 45 56 2333.v_w8_loop: 2334 vpbroadcastq m4, [srcq+stride3q ] 2335 lea srcq, [srcq+strideq*4] 2336 pmaddubsw m5, m2, m9 ; a1 2337 pmaddubsw m6, m2, m8 ; b0 2338 vpblendd m2, m0, m4, 0x30 2339 vpbroadcastq m0, [srcq+strideq*0] 2340 vpblendd m4, m0, 0x30 2341 punpcklbw m2, m4 ; 67 78 2342 pmaddubsw m1, m8 ; a0 2343 pmaddubsw m4, m3, m9 ; b1 2344 paddw m5, m1 2345 mova m1, m3 2346 pmaddubsw m3, m10 ; a2 2347 paddw m6, m4 2348 paddw m5, m3 2349 vpbroadcastq m4, [srcq+strideq*1] 2350 vpblendd m3, m0, m4, 0x30 2351 vpbroadcastq m0, [srcq+strideq*2] 2352 vpblendd m4, m0, 0x30 2353 punpcklbw m3, m4 ; 89 9a 2354 pmaddubsw m4, m2, m11 ; a3 2355 paddw m5, m4 2356 pmaddubsw m4, m2, m10 ; b2 2357 paddw m6, m4 2358 pmaddubsw m4, m3, m11 ; b3 2359 paddw m6, m4 2360 pmulhrsw m5, m7 2361 pmulhrsw m6, m7 2362 mova [tmpq+32*0], m5 2363 mova [tmpq+32*1], m6 2364 add tmpq, 32*2 2365 sub hd, 4 2366 jg .v_w8_loop 2367 RET 2368.v_w16: 2369 add wd, wd 2370 mov r5, srcq 2371 mov r7, tmpq 2372 lea r6d, [hq+wq*8-256] 2373.v_w16_loop0: 2374 vbroadcasti128 m4, [srcq+strideq*0] 2375 vbroadcasti128 m5, [srcq+strideq*1] 2376 lea srcq, [srcq+strideq*2] 2377 vbroadcasti128 m0, [srcq+strideq*1] 2378 vbroadcasti128 m6, [srcq+strideq*0] 2379 lea srcq, [srcq+strideq*2] 2380 vbroadcasti128 
m1, [srcq+strideq*0] 2381 vbroadcasti128 m2, [srcq+strideq*1] 2382 lea srcq, [srcq+strideq*2] 2383 vbroadcasti128 m3, [srcq+strideq*0] 2384 shufpd m4, m4, m0, 0x0c 2385 shufpd m5, m5, m1, 0x0c 2386 punpcklbw m1, m4, m5 ; 01 2387 punpckhbw m4, m5 ; 34 2388 shufpd m6, m6, m2, 0x0c 2389 punpcklbw m2, m5, m6 ; 12 2390 punpckhbw m5, m6 ; 45 2391 shufpd m0, m0, m3, 0x0c 2392 punpcklbw m3, m6, m0 ; 23 2393 punpckhbw m6, m0 ; 56 2394.v_w16_loop: 2395 vbroadcasti128 m12, [srcq+strideq*1] 2396 lea srcq, [srcq+strideq*2] 2397 vbroadcasti128 m13, [srcq+strideq*0] 2398 pmaddubsw m14, m1, m8 ; a0 2399 pmaddubsw m15, m2, m8 ; b0 2400 mova m1, m3 2401 mova m2, m4 2402 pmaddubsw m3, m9 ; a1 2403 pmaddubsw m4, m9 ; b1 2404 paddw m14, m3 2405 paddw m15, m4 2406 mova m3, m5 2407 mova m4, m6 2408 pmaddubsw m5, m10 ; a2 2409 pmaddubsw m6, m10 ; b2 2410 paddw m14, m5 2411 paddw m15, m6 2412 shufpd m6, m0, m12, 0x0d 2413 shufpd m0, m12, m13, 0x0c 2414 punpcklbw m5, m6, m0 ; 67 2415 punpckhbw m6, m0 ; 78 2416 pmaddubsw m12, m5, m11 ; a3 2417 pmaddubsw m13, m6, m11 ; b3 2418 paddw m14, m12 2419 paddw m15, m13 2420 pmulhrsw m14, m7 2421 pmulhrsw m15, m7 2422 mova [tmpq+wq*0], m14 2423 mova [tmpq+wq*1], m15 2424 lea tmpq, [tmpq+wq*2] 2425 sub hd, 2 2426 jg .v_w16_loop 2427 add r5, 16 2428 add r7, 32 2429 movzx hd, r6b 2430 mov srcq, r5 2431 mov tmpq, r7 2432 sub r6d, 1<<8 2433 jg .v_w16_loop0 2434 RET 2435.hv: 2436 %assign stack_offset stack_offset - stack_size_padded 2437 %assign stack_size_padded 0 2438 WIN64_SPILL_XMM 16 2439 cmp wd, 4 2440 je .hv_w4 2441 shr mxd, 16 2442 sub srcq, 3 2443 vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] 2444 vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] 2445 movzx mxd, myb 2446 shr myd, 16 2447 cmp hd, 4 2448 cmove myd, mxd 2449 vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] 2450 lea stride3q, [strideq*3] 2451 sub srcq, stride3q 2452 punpcklbw m0, m0 2453 psraw m0, 8 ; sign-extend 2454 pshufd m12, m0, q0000 2455 pshufd m13, m0, q1111 2456 pshufd m14, m0, q2222 2457 pshufd m15, m0, q3333 2458 jmp .hv_w8 2459.hv_w4: 2460 movzx mxd, mxb 2461 dec srcq 2462 vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] 2463 movzx mxd, myb 2464 shr myd, 16 2465 cmp hd, 4 2466 cmove myd, mxd 2467 vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] 2468 lea stride3q, [strideq*3] 2469 sub srcq, stride3q 2470 mova m7, [subpel_h_shuf4] 2471 pmovzxbd m9, [deint_shuf4] 2472 vpbroadcastd m10, [pw_8192] 2473 punpcklbw m0, m0 2474 psraw m0, 8 ; sign-extend 2475 vpbroadcastd m11, [pd_32] 2476 pshufd m12, m0, q0000 2477 pshufd m13, m0, q1111 2478 pshufd m14, m0, q2222 2479 pshufd m15, m0, q3333 2480 vpbroadcastq m2, [srcq+strideq*0] 2481 vpbroadcastq m4, [srcq+strideq*1] 2482 vpbroadcastq m0, [srcq+strideq*2] 2483 vpbroadcastq m5, [srcq+stride3q ] 2484 lea srcq, [srcq+strideq*4] 2485 vpbroadcastq m3, [srcq+strideq*0] 2486 vpbroadcastq m6, [srcq+strideq*1] 2487 vpbroadcastq m1, [srcq+strideq*2] 2488 vpblendd m2, m4, 0xcc ; 0 1 2489 vpblendd m0, m5, 0xcc ; 2 3 2490 vpblendd m3, m6, 0xcc ; 4 5 2491 pshufb m2, m7 ; 00 01 10 11 02 03 12 13 2492 pshufb m0, m7 ; 20 21 30 31 22 23 32 33 2493 pshufb m3, m7 ; 40 41 50 51 42 43 52 53 2494 pshufb m1, m7 ; 60 61 60 61 62 63 62 63 2495 pmaddubsw m2, m8 2496 pmaddubsw m0, m8 2497 pmaddubsw m3, m8 2498 pmaddubsw m1, m8 2499 phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b 2500 phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ 2501 pmulhrsw m2, m10 2502 pmulhrsw m3, m10 2503 palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b 2504 punpcklwd m1, m2, m4 ; 01 
12 2505 punpckhwd m2, m4 ; 23 34 2506 pshufd m0, m3, q2121 2507 punpcklwd m3, m0 ; 45 56 2508.hv_w4_loop: 2509 pmaddwd m5, m1, m12 ; a0 b0 2510 pmaddwd m6, m2, m12 ; c0 d0 2511 pmaddwd m2, m13 ; a1 b1 2512 pmaddwd m4, m3, m13 ; c1 d1 2513 mova m1, m3 2514 pmaddwd m3, m14 ; a2 b2 2515 paddd m5, m2 2516 vpbroadcastq m2, [srcq+stride3q ] 2517 lea srcq, [srcq+strideq*4] 2518 paddd m6, m4 2519 vpbroadcastq m4, [srcq+strideq*0] 2520 paddd m5, m3 2521 vpbroadcastq m3, [srcq+strideq*1] 2522 vpblendd m2, m4, 0xcc 2523 vpbroadcastq m4, [srcq+strideq*2] 2524 vpblendd m3, m4, 0xcc 2525 pshufb m2, m7 2526 pshufb m3, m7 2527 pmaddubsw m2, m8 2528 pmaddubsw m3, m8 2529 phaddw m2, m3 2530 pmulhrsw m2, m10 2531 palignr m3, m2, m0, 12 2532 mova m0, m2 2533 punpcklwd m2, m3, m0 ; 67 78 2534 punpckhwd m3, m0 ; 89 9a 2535 pmaddwd m4, m2, m14 ; c2 d2 2536 paddd m6, m11 2537 paddd m5, m11 2538 paddd m6, m4 2539 pmaddwd m4, m2, m15 ; a3 b3 2540 paddd m5, m4 2541 pmaddwd m4, m3, m15 ; c3 d3 2542 paddd m6, m4 2543 psrad m5, 6 2544 psrad m6, 6 2545 packssdw m5, m6 2546 vpermd m5, m9, m5 2547 mova [tmpq], m5 2548 add tmpq, 32 2549 sub hd, 4 2550 jg .hv_w4_loop 2551 RET 2552.hv_w8: 2553 lea r6d, [wq*8-64] 2554 mov r5, srcq 2555 mov r7, tmpq 2556 lea r6d, [hq+r6*4] 2557.hv_w8_loop0: 2558 vbroadcasti128 m7, [subpel_h_shufA] 2559 movu xm4, [srcq+strideq*0] 2560 vbroadcasti128 m8, [subpel_h_shufB] 2561 movu xm5, [srcq+strideq*1] 2562 lea srcq, [srcq+strideq*2] 2563 vbroadcasti128 m9, [subpel_h_shufC] 2564 movu xm6, [srcq+strideq*0] 2565 vbroadcasti128 m0, [srcq+strideq*1] 2566 lea srcq, [srcq+strideq*2] 2567 vpblendd m4, m0, 0xf0 ; 0 3 2568 vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 2569 vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 2570 lea srcq, [srcq+strideq*2] 2571 vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 2572 HV_H_W8 m4, m1, m2, m3, m7, m8, m9 2573 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 2574 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 2575 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 2576 vpbroadcastd m7, [pw_8192] 2577 vpermq m4, m4, q3120 2578 vpermq m5, m5, q3120 2579 vpermq m6, m6, q3120 2580 pmulhrsw m0, m7 2581 pmulhrsw m4, m7 2582 pmulhrsw m5, m7 2583 pmulhrsw m6, m7 2584 vpermq m7, m0, q3120 2585 punpcklwd m1, m4, m5 ; 01 2586 punpckhwd m4, m5 ; 34 2587 punpcklwd m2, m5, m6 ; 12 2588 punpckhwd m5, m6 ; 45 2589 punpcklwd m3, m6, m7 ; 23 2590 punpckhwd m6, m7 ; 56 2591.hv_w8_loop: 2592 vextracti128 [tmpq], m0, 1 ; not enough registers 2593 movu xm0, [srcq+strideq*1] 2594 lea srcq, [srcq+strideq*2] 2595 vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 2596 pmaddwd m8, m1, m12 ; a0 2597 pmaddwd m9, m2, m12 ; b0 2598 mova m1, m3 2599 mova m2, m4 2600 pmaddwd m3, m13 ; a1 2601 pmaddwd m4, m13 ; b1 2602 paddd m8, m3 2603 paddd m9, m4 2604 mova m3, m5 2605 mova m4, m6 2606 pmaddwd m5, m14 ; a2 2607 pmaddwd m6, m14 ; b2 2608 paddd m8, m5 2609 paddd m9, m6 2610 vbroadcasti128 m6, [subpel_h_shufB] 2611 vbroadcasti128 m7, [subpel_h_shufC] 2612 vbroadcasti128 m5, [subpel_h_shufA] 2613 HV_H_W8 m0, m5, m6, m7, m5, m6, m7 2614 vpbroadcastd m5, [pw_8192] 2615 vpbroadcastd m7, [pd_32] 2616 vbroadcasti128 m6, [tmpq] 2617 pmulhrsw m0, m5 2618 paddd m8, m7 2619 paddd m9, m7 2620 vpermq m7, m0, q3120 ; 7 8 2621 shufpd m6, m6, m7, 0x04 ; 6 7 2622 punpcklwd m5, m6, m7 ; 67 2623 punpckhwd m6, m7 ; 78 2624 pmaddwd m7, m5, m15 ; a3 2625 paddd m8, m7 2626 pmaddwd m7, m6, m15 ; b3 2627 paddd m7, m9 2628 psrad m8, 6 2629 psrad m7, 6 2630 packssdw m8, m7 2631 vpermq m7, m8, q3120 2632 mova [tmpq+wq*0], xm7 2633 vextracti128 [tmpq+wq*2], m7, 1 2634 lea tmpq, [tmpq+wq*4] 
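    ; two rows of 8 output words were just stored wq*2 bytes apart; once this
    ; 8-pixel column is done, src/tmp restart from r5/r7 (advanced by 8 pixels
    ; and 16 bytes per column), with h reloaded from the low byte of r6d and
    ; the remaining column count kept in its upper bits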
2635 sub hd, 2 2636 jg .hv_w8_loop 2637 add r5, 8 2638 add r7, 16 2639 movzx hd, r6b 2640 mov srcq, r5 2641 mov tmpq, r7 2642 sub r6d, 1<<8 2643 jg .hv_w8_loop0 2644 RET 2645 2646%macro movifprep 2 2647 %if isprep 2648 mov %1, %2 2649 %endif 2650%endmacro 2651 2652%macro REMAP_REG 2 2653 %xdefine r%1 r%2 2654 %xdefine r%1q r%2q 2655 %xdefine r%1d r%2d 2656%endmacro 2657 2658%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 2659 %if isprep 2660 %xdefine r14_save r14 2661 %assign %%i 14 2662 %rep 14 2663 %assign %%j %%i-1 2664 REMAP_REG %%i, %%j 2665 %assign %%i %%i-1 2666 %endrep 2667 %endif 2668%endmacro 2669 2670%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 2671 %if isprep 2672 %assign %%i 1 2673 %rep 13 2674 %assign %%j %%i+1 2675 REMAP_REG %%i, %%j 2676 %assign %%i %%i+1 2677 %endrep 2678 %xdefine r14 r14_save 2679 %undef r14_save 2680 %endif 2681%endmacro 2682 2683%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged 2684 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 2685 RET 2686 %if %1 2687 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 2688 %endif 2689%endmacro 2690 2691%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] 2692 movq xm%1, [srcq+ r4] 2693 movq xm%2, [srcq+ r6] 2694 movhps xm%1, [srcq+ r7] 2695 movhps xm%2, [srcq+ r9] 2696 vinserti128 m%1, [srcq+r10], 1 2697 vinserti128 m%2, [srcq+r11], 1 2698 vpbroadcastq m%5, [srcq+r13] 2699 vpbroadcastq m%6, [srcq+ rX] 2700 add srcq, ssq 2701 movq xm%3, [srcq+ r4] 2702 movq xm%4, [srcq+ r6] 2703 movhps xm%3, [srcq+ r7] 2704 movhps xm%4, [srcq+ r9] 2705 vinserti128 m%3, [srcq+r10], 1 2706 vinserti128 m%4, [srcq+r11], 1 2707 vpbroadcastq m%7, [srcq+r13] 2708 vpbroadcastq m%8, [srcq+ rX] 2709 add srcq, ssq 2710 vpblendd m%1, m%5, 0xc0 2711 vpblendd m%2, m%6, 0xc0 2712 vpblendd m%3, m%7, 0xc0 2713 vpblendd m%4, m%8, 0xc0 2714 pmaddubsw m%1, m15 2715 pmaddubsw m%2, m10 2716 pmaddubsw m%3, m15 2717 pmaddubsw m%4, m10 2718 phaddw m%1, m%2 2719 phaddw m%3, m%4 2720 phaddw m%1, m%3 2721 pmulhrsw m%1, m12 2722%endmacro 2723 2724%macro MC_8TAP_SCALED 1 2725%ifidn %1, put 2726 %assign isprep 0 2727 %if required_stack_alignment <= STACK_ALIGNMENT 2728cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy 2729 %else 2730cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy 2731 %endif 2732 %xdefine base_reg r12 2733 %define rndshift 10 2734%else 2735 %assign isprep 1 2736 %if required_stack_alignment <= STACK_ALIGNMENT 2737cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy 2738 %xdefine tmp_stridem r14q 2739 %else 2740cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy 2741 %define tmp_stridem qword [rsp+120] 2742 %endif 2743 %xdefine base_reg r11 2744 %define rndshift 6 2745%endif 2746 lea base_reg, [%1_8tap_scaled_avx2] 2747%define base base_reg-%1_8tap_scaled_avx2 2748 tzcnt wd, wm 2749 vpbroadcastd m8, dxm 2750%if isprep && UNIX64 2751 movd xm14, mxd 2752 vpbroadcastd m14, xm14 2753 mov r5d, t0d 2754 DECLARE_REG_TMP 5, 7 2755%else 2756 vpbroadcastd m14, mxm 2757%endif 2758 mov dyd, dym 2759%ifidn %1, put 2760 %if WIN64 2761 mov r8d, hm 2762 DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 2763 %define hm r5m 2764 %define dxm r8m 2765 %else 2766 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 2767 %define hm r6m 2768 %endif 2769 %if required_stack_alignment > STACK_ALIGNMENT 2770 %define dsm [rsp+112] 2771 %define rX r1 2772 %define rXd r1d 2773 %else 2774 %define dsm dsq 2775 %define rX r14 2776 %define rXd r14d 2777 %endif 2778%else ; prep 2779 %if WIN64 2780 
mov r7d, hm 2781 DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 2782 %define hm r4m 2783 %define dxm r7m 2784 %else 2785 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 2786 %define hm [rsp+112] 2787 %endif 2788 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 2789 %define rX r14 2790 %define rXd r14d 2791%endif 2792 vpbroadcastd m10, [base+pd_0x3ff] 2793 vpbroadcastd m12, [base+pw_8192] 2794%ifidn %1, put 2795 vpbroadcastd m13, [base+pd_512] 2796%else 2797 vpbroadcastd m13, [base+pd_32] 2798%endif 2799 pxor m9, m9 2800 lea ss3q, [ssq*3] 2801 movzx r7d, t1b 2802 shr t1d, 16 2803 cmp hd, 6 2804 cmovs t1d, r7d 2805 sub srcq, ss3q 2806 cmp dyd, 1024 2807 je .dy1 2808 cmp dyd, 2048 2809 je .dy2 2810 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] 2811 add wq, base_reg 2812 jmp wq 2813%ifidn %1, put 2814.w2: 2815 mov myd, mym 2816 movzx t0d, t0b 2817 dec srcq 2818 movd xm15, t0d 2819 punpckldq m8, m9, m8 2820 paddd m14, m8 ; mx+dx*[0-1] 2821 vpbroadcastd m11, [base+pd_0x4000] 2822 vpbroadcastd xm15, xm15 2823 pand m8, m14, m10 2824 psrld m8, 6 2825 paddd xm15, xm8 2826 movd r4d, xm15 2827 pextrd r6d, xm15, 1 2828 vbroadcasti128 m5, [base+bdct_lb_dw] 2829 vbroadcasti128 m6, [base+subpel_s_shuf2] 2830 vpbroadcastd m15, [base+subpel_filters+r4*8+2] 2831 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 2832 pcmpeqd m8, m9 2833 psrld m14, 10 2834 movq xm0, [srcq+ssq*0] 2835 movq xm1, [srcq+ssq*2] 2836 movhps xm0, [srcq+ssq*1] 2837 movhps xm1, [srcq+ss3q ] 2838 lea srcq, [srcq+ssq*4] 2839 pshufb m14, m5 2840 paddb m14, m6 2841 vinserti128 m0, [srcq+ssq*0], 1 2842 vinserti128 m1, [srcq+ssq*2], 1 2843 vpbroadcastq m2, [srcq+ssq*1] 2844 vpbroadcastq m3, [srcq+ss3q ] 2845 lea srcq, [srcq+ssq*4] 2846 vpblendd m15, m7, 0xaa 2847 vpblendd m0, m2, 0xc0 ; 0 1 4 5 2848 vpblendd m1, m3, 0xc0 ; 2 3 6 7 2849 pblendvb m15, m11, m8 2850 pshufb m0, m14 2851 pshufb m1, m14 2852 pmaddubsw m0, m15 2853 pmaddubsw m1, m15 2854 phaddw m0, m1 2855 pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 2856 vextracti128 xm1, m0, 1 ; 4 5 6 7 2857 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 2858 punpcklwd xm3, xm0, xm2 ; 01 12 2859 punpckhwd xm0, xm2 ; 23 34 2860 pshufd xm4, xm1, q0321 ; 5 6 7 _ 2861 punpcklwd xm2, xm1, xm4 ; 45 56 2862 punpckhwd xm4, xm1, xm4 ; 67 __ 2863.w2_loop: 2864 and myd, 0x3ff 2865 mov r6d, 64 << 24 2866 mov r4d, myd 2867 shr r4d, 6 2868 lea r4d, [t1+r4] 2869 cmovnz r6q, [base+subpel_filters+r4*8] 2870 movq xm11, r6q 2871 punpcklbw xm11, xm11 2872 psraw xm11, 8 2873 pshufd xm8, xm11, q0000 2874 pshufd xm9, xm11, q1111 2875 pshufd xm10, xm11, q2222 2876 pshufd xm11, xm11, q3333 2877 pmaddwd xm5, xm3, xm8 2878 pmaddwd xm6, xm0, xm9 2879 pmaddwd xm7, xm2, xm10 2880 pmaddwd xm8, xm4, xm11 2881 paddd xm5, xm6 2882 paddd xm7, xm8 2883 paddd xm5, xm13 2884 paddd xm5, xm7 2885 psrad xm5, 10 2886 packssdw xm5, xm5 2887 packuswb xm5, xm5 2888 pextrw [dstq], xm5, 0 2889 add dstq, dsq 2890 dec hd 2891 jz .ret 2892 add myd, dyd 2893 test myd, ~0x3ff 2894 jz .w2_loop 2895 movq xm5, [srcq] 2896 test myd, 0x400 2897 jz .w2_skip_line 2898 add srcq, ssq 2899 shufps xm3, xm0, q1032 ; 01 12 2900 shufps xm0, xm2, q1032 ; 23 34 2901 shufps xm2, xm4, q1032 ; 45 56 2902 pshufb xm5, xm14 2903 pmaddubsw xm5, xm15 2904 phaddw xm5, xm5 2905 pmulhrsw xm5, xm12 2906 palignr xm1, xm5, xm1, 12 2907 punpcklqdq xm1, xm1 ; 6 7 6 7 2908 punpcklwd xm4, xm1, xm5 ; 67 __ 2909 jmp .w2_loop 2910.w2_skip_line: 2911 movhps xm5, [srcq+ssq*1] 2912 lea srcq, [srcq+ssq*2] 2913 mova xm3, xm0 ; 01 12 2914 mova xm0, xm2 ; 23 34 2915 pshufb xm5, xm14 2916 pmaddubsw xm5, xm15 2917 
phaddw xm5, xm5 2918 pmulhrsw xm5, xm12 ; 6 7 6 7 2919 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 2920 pshufd xm5, xm1, q0321 ; 5 6 7 _ 2921 punpcklwd xm2, xm1, xm5 ; 45 56 2922 punpckhwd xm4, xm1, xm5 ; 67 __ 2923 jmp .w2_loop 2924%endif 2925.w4: 2926 mov myd, mym 2927 vbroadcasti128 m7, [base+rescale_mul] 2928 movzx t0d, t0b 2929 dec srcq 2930 movd xm15, t0d 2931 pmaddwd m8, m7 2932 vpbroadcastd m11, [base+pd_0x4000] 2933 vpbroadcastd xm15, xm15 2934 paddd m14, m8 ; mx+dx*[0-3] 2935 pand m0, m14, m10 2936 psrld m0, 6 2937 paddd xm15, xm0 2938 movd r4d, xm15 2939 pextrd r6d, xm15, 1 2940 pextrd r11d, xm15, 2 2941 pextrd r13d, xm15, 3 2942 movd xm15, [base+subpel_filters+r4*8+2] 2943 vbroadcasti128 m5, [base+bdct_lb_dw] 2944 vpbroadcastq m6, [base+subpel_s_shuf2] 2945 pinsrd xm15, [base+subpel_filters+r6*8+2], 1 2946 pcmpeqd m0, m9 2947 psrld m14, 10 2948 movu xm7, [srcq+ssq*0] 2949 movu xm9, [srcq+ssq*1] 2950 pinsrd xm15, [base+subpel_filters+r11*8+2], 2 2951 movu xm8, [srcq+ssq*2] 2952 movu xm10, [srcq+ss3q ] 2953 pinsrd xm15, [base+subpel_filters+r13*8+2], 3 2954 lea srcq, [srcq+ssq*4] 2955 pshufb m14, m5 2956 paddb m14, m6 2957 vinserti128 m7, [srcq+ssq*0], 1 2958 vinserti128 m9, [srcq+ssq*1], 1 2959 vinserti128 m15, xm15, 1 2960 vinserti128 m8, [srcq+ssq*2], 1 2961 vinserti128 m10, [srcq+ss3q ], 1 2962 lea srcq, [srcq+ssq*4] 2963 pblendvb m15, m11, m0 2964 pshufb m7, m14 2965 pshufb m9, m14 2966 pshufb m8, m14 2967 pshufb m10, m14 2968 pmaddubsw m7, m15 2969 pmaddubsw m9, m15 2970 pmaddubsw m8, m15 2971 pmaddubsw m10, m15 2972 phaddw m7, m9 2973 phaddw m8, m10 2974 pmulhrsw m7, m12 ; 0 1 4 5 2975 pmulhrsw m8, m12 ; 2 3 6 7 2976 vextracti128 xm9, m7, 1 ; 4 5 2977 vextracti128 xm3, m8, 1 ; 6 7 2978 shufps xm4, xm7, xm8, q1032 ; 1 2 2979 shufps xm5, xm8, xm9, q1032 ; 3 4 2980 shufps xm6, xm9, xm3, q1032 ; 5 6 2981 psrldq xm11, xm3, 8 ; 7 _ 2982 punpcklwd xm0, xm7, xm4 ; 01 2983 punpckhwd xm7, xm4 ; 12 2984 punpcklwd xm1, xm8, xm5 ; 23 2985 punpckhwd xm8, xm5 ; 34 2986 punpcklwd xm2, xm9, xm6 ; 45 2987 punpckhwd xm9, xm6 ; 56 2988 punpcklwd xm3, xm11 ; 67 2989 mova [rsp+0x00], xm7 2990 mova [rsp+0x10], xm8 2991 mova [rsp+0x20], xm9 2992.w4_loop: 2993 and myd, 0x3ff 2994 mov r6d, 64 << 24 2995 mov r4d, myd 2996 shr r4d, 6 2997 lea r4d, [t1+r4] 2998 cmovnz r6q, [base+subpel_filters+r4*8] 2999 movq xm10, r6q 3000 punpcklbw xm10, xm10 3001 psraw xm10, 8 3002 pshufd xm7, xm10, q0000 3003 pshufd xm8, xm10, q1111 3004 pshufd xm9, xm10, q2222 3005 pshufd xm10, xm10, q3333 3006 pmaddwd xm4, xm0, xm7 3007 pmaddwd xm5, xm1, xm8 3008 pmaddwd xm6, xm2, xm9 3009 pmaddwd xm7, xm3, xm10 3010 paddd xm4, xm5 3011 paddd xm6, xm7 3012 paddd xm4, xm13 3013 paddd xm4, xm6 3014 psrad xm4, rndshift 3015 packssdw xm4, xm4 3016%ifidn %1, put 3017 packuswb xm4, xm4 3018 movd [dstq], xm4 3019 add dstq, dsq 3020%else 3021 movq [tmpq], xm4 3022 add tmpq, 8 3023%endif 3024 dec hd 3025 jz .ret 3026 add myd, dyd 3027 test myd, ~0x3ff 3028 jz .w4_loop 3029 movu xm4, [srcq] 3030 test myd, 0x400 3031 jz .w4_skip_line 3032 mova xm0, [rsp+0x00] 3033 mova [rsp+0x00], xm1 3034 mova xm1, [rsp+0x10] 3035 mova [rsp+0x10], xm2 3036 mova xm2, [rsp+0x20] 3037 mova [rsp+0x20], xm3 3038 pshufb xm4, xm14 3039 pmaddubsw xm4, xm15 3040 phaddw xm4, xm4 3041 pmulhrsw xm4, xm12 3042 punpcklwd xm3, xm11, xm4 3043 mova xm11, xm4 3044 add srcq, ssq 3045 jmp .w4_loop 3046.w4_skip_line: 3047 movu xm5, [srcq+ssq*1] 3048 movu m6, [rsp+0x10] 3049 pshufb xm4, xm14 3050 pshufb xm5, xm14 3051 pmaddubsw xm4, xm15 3052 pmaddubsw xm5, xm15 3053 movu [rsp+0x00], 
m6 3054 phaddw xm4, xm5 3055 pmulhrsw xm4, xm12 3056 punpcklwd xm9, xm11, xm4 3057 mova [rsp+0x20], xm9 3058 psrldq xm11, xm4, 8 3059 mova xm0, xm1 3060 mova xm1, xm2 3061 mova xm2, xm3 3062 punpcklwd xm3, xm4, xm11 3063 lea srcq, [srcq+ssq*2] 3064 jmp .w4_loop 3065.w8: 3066 mov dword [rsp+48], 1 3067 movifprep tmp_stridem, 16 3068 jmp .w_start 3069.w16: 3070 mov dword [rsp+48], 2 3071 movifprep tmp_stridem, 32 3072 jmp .w_start 3073.w32: 3074 mov dword [rsp+48], 4 3075 movifprep tmp_stridem, 64 3076 jmp .w_start 3077.w64: 3078 mov dword [rsp+48], 8 3079 movifprep tmp_stridem, 128 3080 jmp .w_start 3081.w128: 3082 mov dword [rsp+48], 16 3083 movifprep tmp_stridem, 256 3084.w_start: 3085%ifidn %1, put 3086 movifnidn dsm, dsq 3087%endif 3088 shr t0d, 16 3089 sub srcq, 3 3090 pmaddwd m8, [base+rescale_mul] 3091 movd xm15, t0d 3092 mov [rsp+72], t0d 3093 mov [rsp+56], srcq 3094 mov [rsp+64], r0q ; dstq / tmpq 3095%if UNIX64 3096 mov hm, hd 3097%endif 3098 shl dword dxm, 3 ; dx*8 3099 vpbroadcastd m15, xm15 3100 paddd m14, m8 ; mx+dx*[0-7] 3101 jmp .hloop 3102.hloop_prep: 3103 dec dword [rsp+48] 3104 jz .ret 3105 add qword [rsp+64], 8*(isprep+1) 3106 mov hd, hm 3107 vpbroadcastd m8, dxm 3108 vpbroadcastd m10, [base+pd_0x3ff] 3109 paddd m14, m8, [rsp+16] 3110 vpbroadcastd m15, [rsp+72] 3111 pxor m9, m9 3112 mov srcq, [rsp+56] 3113 mov r0q, [rsp+64] ; dstq / tmpq 3114.hloop: 3115 vpbroadcastq m11, [base+pq_0x40000000] 3116 pand m6, m14, m10 3117 psrld m6, 6 3118 paddd m15, m6 3119 pcmpeqd m6, m9 3120 vextracti128 xm7, m15, 1 3121 movd r4d, xm15 3122 pextrd r6d, xm15, 2 3123 pextrd r7d, xm15, 1 3124 pextrd r9d, xm15, 3 3125 movd r10d, xm7 3126 pextrd r11d, xm7, 2 3127 pextrd r13d, xm7, 1 3128 pextrd rXd, xm7, 3 3129 movu [rsp+16], m14 3130 movq xm15, [base+subpel_filters+ r4*8] 3131 movq xm10, [base+subpel_filters+ r6*8] 3132 movhps xm15, [base+subpel_filters+ r7*8] 3133 movhps xm10, [base+subpel_filters+ r9*8] 3134 vinserti128 m15, [base+subpel_filters+r10*8], 1 3135 vinserti128 m10, [base+subpel_filters+r11*8], 1 3136 vpbroadcastq m9, [base+subpel_filters+r13*8] 3137 vpbroadcastq m8, [base+subpel_filters+ rX*8] 3138 psrld m14, 10 3139 vextracti128 xm7, m14, 1 3140 mova [rsp], xm14 3141 movd r4d, xm14 3142 pextrd r6d, xm14, 2 3143 pextrd r7d, xm14, 1 3144 pextrd r9d, xm14, 3 3145 movd r10d, xm7 3146 pextrd r11d, xm7, 2 3147 pextrd r13d, xm7, 1 3148 pextrd rXd, xm7, 3 3149 pshufd m5, m6, q1100 3150 pshufd m6, m6, q3322 3151 vpblendd m15, m9, 0xc0 3152 vpblendd m10, m8, 0xc0 3153 pblendvb m15, m11, m5 3154 pblendvb m10, m11, m6 3155 vbroadcasti128 m14, [base+subpel_s_shuf8] 3156 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 3157 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 3158 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 3159 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 3160 mov myd, mym 3161 mov dyd, dym 3162 pshufb m0, m14 ; 01a 01b 3163 pshufb m1, m14 ; 23a 23b 3164 pshufb m2, m14 ; 45a 45b 3165 pshufb m3, m14 ; 67a 67b 3166 vbroadcasti128 m14, [base+wswap] 3167.vloop: 3168 and myd, 0x3ff 3169 mov r6d, 64 << 24 3170 mov r4d, myd 3171 shr r4d, 6 3172 lea r4d, [t1+r4] 3173 cmovnz r6q, [base+subpel_filters+r4*8] 3174 movq xm11, r6q 3175 punpcklbw xm11, xm11 3176 psraw xm11, 8 3177 vinserti128 m11, xm11, 1 3178 pshufd m8, m11, q0000 3179 pshufd m9, m11, q1111 3180 pmaddwd m4, m0, m8 3181 pmaddwd m5, m1, m9 3182 pshufd m8, m11, q2222 3183 pshufd m11, m11, q3333 3184 pmaddwd m6, m2, m8 3185 pmaddwd m7, m3, m11 3186 paddd m4, m5 3187 paddd m6, m7 3188 paddd m4, m13 
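    ; m13 is the rounding bias loaded in the prologue (pd_512 for put,
    ; pd_32 for prep), matching the final shift by rndshift (10 or 6)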
3189 paddd m4, m6 3190 psrad m4, rndshift 3191 vextracti128 xm5, m4, 1 3192 packssdw xm4, xm5 3193%ifidn %1, put 3194 packuswb xm4, xm4 3195 movq [dstq], xm4 3196 add dstq, dsm 3197%else 3198 mova [tmpq], xm4 3199 add tmpq, tmp_stridem 3200%endif 3201 dec hd 3202 jz .hloop_prep 3203 add myd, dyd 3204 test myd, ~0x3ff 3205 jz .vloop 3206 test myd, 0x400 3207 mov [rsp+52], myd 3208 mov r4d, [rsp+ 0] 3209 mov r6d, [rsp+ 8] 3210 mov r7d, [rsp+ 4] 3211 mov r9d, [rsp+12] 3212 jz .skip_line 3213 vpbroadcastq m6, [srcq+r13] 3214 vpbroadcastq m7, [srcq+ rX] 3215 movq xm4, [srcq+ r4] 3216 movq xm5, [srcq+ r6] 3217 movhps xm4, [srcq+ r7] 3218 movhps xm5, [srcq+ r9] 3219 vinserti128 m4, [srcq+r10], 1 3220 vinserti128 m5, [srcq+r11], 1 3221 add srcq, ssq 3222 mov myd, [rsp+52] 3223 mov dyd, dym 3224 pshufb m0, m14 3225 pshufb m1, m14 3226 pshufb m2, m14 3227 pshufb m3, m14 3228 vpblendd m4, m6, 0xc0 3229 vpblendd m5, m7, 0xc0 3230 pmaddubsw m4, m15 3231 pmaddubsw m5, m10 3232 phaddw m4, m5 3233 pslld m5, m4, 16 3234 paddw m4, m5 3235 pmulhrsw m4, m12 3236 pblendw m0, m1, 0xaa 3237 pblendw m1, m2, 0xaa 3238 pblendw m2, m3, 0xaa 3239 pblendw m3, m4, 0xaa 3240 jmp .vloop 3241.skip_line: 3242 mova m0, m1 3243 mova m1, m2 3244 mova m2, m3 3245 vpbroadcastq m7, [srcq+r13] 3246 vpbroadcastq m8, [srcq+ rX] 3247 movq xm3, [srcq+ r4] 3248 movq xm4, [srcq+ r6] 3249 movhps xm3, [srcq+ r7] 3250 movhps xm4, [srcq+ r9] 3251 vinserti128 m3, [srcq+r10], 1 3252 vinserti128 m4, [srcq+r11], 1 3253 add srcq, ssq 3254 movq xm5, [srcq+ r4] 3255 movq xm6, [srcq+ r6] 3256 movhps xm5, [srcq+ r7] 3257 movhps xm6, [srcq+ r9] 3258 vinserti128 m5, [srcq+r10], 1 3259 vinserti128 m6, [srcq+r11], 1 3260 vpbroadcastq m9, [srcq+r13] 3261 vpbroadcastq m11, [srcq+ rX] 3262 add srcq, ssq 3263 mov myd, [rsp+52] 3264 mov dyd, dym 3265 vpblendd m3, m7, 0xc0 3266 vpblendd m4, m8, 0xc0 3267 vpblendd m5, m9, 0xc0 3268 vpblendd m6, m11, 0xc0 3269 pmaddubsw m3, m15 3270 pmaddubsw m4, m10 3271 pmaddubsw m5, m15 3272 pmaddubsw m6, m10 3273 phaddw m3, m4 3274 phaddw m5, m6 3275 psrld m4, m3, 16 3276 pslld m6, m5, 16 3277 paddw m3, m4 3278 paddw m5, m6 3279 pblendw m3, m5, 0xaa 3280 pmulhrsw m3, m12 3281 jmp .vloop 3282.dy1: 3283 movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] 3284 add wq, base_reg 3285 jmp wq 3286%ifidn %1, put 3287.dy1_w2: 3288 mov myd, mym 3289 movzx t0d, t0b 3290 dec srcq 3291 movd xm15, t0d 3292 punpckldq m8, m9, m8 3293 paddd m14, m8 ; mx+dx*[0-1] 3294 vpbroadcastd m11, [base+pd_0x4000] 3295 vpbroadcastd xm15, xm15 3296 pand m8, m14, m10 3297 psrld m8, 6 3298 paddd xm15, xm8 3299 movd r4d, xm15 3300 pextrd r6d, xm15, 1 3301 vbroadcasti128 m5, [base+bdct_lb_dw] 3302 vbroadcasti128 m6, [base+subpel_s_shuf2] 3303 vpbroadcastd m15, [base+subpel_filters+r4*8+2] 3304 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 3305 pcmpeqd m8, m9 3306 psrld m14, 10 3307 movq xm0, [srcq+ssq*0] 3308 movq xm1, [srcq+ssq*2] 3309 movhps xm0, [srcq+ssq*1] 3310 movhps xm1, [srcq+ss3q ] 3311 lea srcq, [srcq+ssq*4] 3312 shr myd, 6 3313 mov r4d, 64 << 24 3314 lea myd, [t1+myq] 3315 cmovnz r4q, [base+subpel_filters+myq*8] 3316 pshufb m14, m5 3317 paddb m14, m6 3318 vinserti128 m0, [srcq+ssq*0], 1 3319 vinserti128 m1, [srcq+ssq*2], 1 3320 vpbroadcastq m2, [srcq+ssq*1] 3321 add srcq, ss3q 3322 movq xm10, r4q 3323 punpcklbw xm10, xm10 3324 psraw xm10, 8 3325 vpblendd m15, m7, 0xaa 3326 pblendvb m15, m11, m8 3327 pshufd xm8, xm10, q0000 3328 pshufd xm9, xm10, q1111 3329 pshufd xm11, xm10, q3333 3330 pshufd xm10, xm10, q2222 3331 vpblendd m0, m2, 0xc0 3332 
pshufb m1, m14 3333 pshufb m0, m14 3334 pmaddubsw m1, m15 3335 pmaddubsw m0, m15 3336 phaddw m0, m1 3337 pmulhrsw m0, m12 3338 vextracti128 xm1, m0, 1 3339 palignr xm2, xm1, xm0, 4 3340 pshufd xm4, xm1, q2121 3341 punpcklwd xm3, xm0, xm2 ; 01 12 3342 punpckhwd xm0, xm2 ; 23 34 3343 punpcklwd xm2, xm1, xm4 ; 45 56 3344.dy1_w2_loop: 3345 movq xm1, [srcq+ssq*0] 3346 movhps xm1, [srcq+ssq*1] 3347 lea srcq, [srcq+ssq*2] 3348 pmaddwd xm5, xm3, xm8 3349 pmaddwd xm6, xm0, xm9 3350 pmaddwd xm7, xm2, xm10 3351 mova xm3, xm0 3352 mova xm0, xm2 3353 paddd xm5, xm13 3354 paddd xm6, xm7 3355 pshufb xm1, xm14 3356 pmaddubsw xm1, xm15 3357 phaddw xm1, xm1 3358 pmulhrsw xm1, xm12 3359 palignr xm7, xm1, xm4, 12 3360 punpcklwd xm2, xm7, xm1 ; 67 78 3361 pmaddwd xm7, xm2, xm11 3362 mova xm4, xm1 3363 paddd xm5, xm6 3364 paddd xm5, xm7 3365 psrad xm5, rndshift 3366 packssdw xm5, xm5 3367 packuswb xm5, xm5 3368 pextrw [dstq+dsq*0], xm5, 0 3369 pextrw [dstq+dsq*1], xm5, 1 3370 lea dstq, [dstq+dsq*2] 3371 sub hd, 2 3372 jg .dy1_w2_loop 3373 RET 3374%endif 3375.dy1_w4: 3376 mov myd, mym 3377 vbroadcasti128 m7, [base+rescale_mul] 3378 movzx t0d, t0b 3379 dec srcq 3380 movd xm15, t0d 3381 pmaddwd m8, m7 3382 vpbroadcastd m11, [base+pd_0x4000] 3383 vpbroadcastd xm15, xm15 3384 paddd m14, m8 ; mx+dx*[0-3] 3385 pand m8, m14, m10 3386 psrld m8, 6 3387 paddd xm15, xm8 3388 vpermq m8, m8, q3120 3389 movd r4d, xm15 3390 pextrd r6d, xm15, 2 3391 pextrd r11d, xm15, 1 3392 pextrd r13d, xm15, 3 3393 movd xm15, [base+subpel_filters+r4*8+2] 3394 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 3395 movu xm2, [srcq+ssq*0] 3396 movu xm3, [srcq+ssq*2] 3397 vbroadcasti128 m5, [base+bdct_lb_dw] 3398 vpbroadcastq m6, [base+subpel_s_shuf2] 3399 pcmpeqd m8, m9 3400 psrld m14, 10 3401 pinsrd xm15, [base+subpel_filters+r11*8+2], 1 3402 vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 3403 vinserti128 m2, [srcq+ssq*1], 1 3404 vinserti128 m3, [srcq+ss3q ], 1 3405 lea srcq, [srcq+ssq*4] 3406 shr myd, 6 3407 mov r4d, 64 << 24 3408 lea myd, [t1+myq] 3409 cmovnz r4q, [base+subpel_filters+myq*8] 3410 pshufb m14, m5 3411 paddb m14, m6 3412 movu xm4, [srcq+ssq*0] 3413 movu xm5, [srcq+ssq*2] 3414 vinserti128 m4, [srcq+ssq*1], 1 3415 add srcq, ss3q 3416 vpblendd m15, m7, 0x30 3417 punpcklqdq m15, m15 3418 pblendvb m15, m11, m8 3419 movq xm10, r4q 3420 punpcklbw xm10, xm10 3421 psraw xm10, 8 3422 vinserti128 m10, xm10, 1 3423 pshufb m2, m14 3424 pshufb m3, m14 3425 pshufb m4, m14 3426 pshufb xm5, xm14 3427 vpermq m2, m2, q3120 3428 vpermq m3, m3, q3120 3429 vpermq m4, m4, q3120 3430 vpermq m5, m5, q3120 3431 pshufd m7, m10, q0000 3432 pshufd m8, m10, q1111 3433 pshufd m9, m10, q2222 3434 pshufd m10, m10, q3333 3435 pmaddubsw m2, m15 3436 pmaddubsw m3, m15 3437 pmaddubsw m4, m15 3438 pmaddubsw m5, m15 3439 phaddw m2, m3 3440 phaddw m4, m5 3441 pmulhrsw m2, m12 3442 pmulhrsw m4, m12 3443 palignr m5, m4, m2, 4 3444 pshufd m3, m4, q2121 3445 punpcklwd m0, m2, m5 ; 01 12 3446 punpckhwd m1, m2, m5 ; 23 34 3447 punpcklwd m2, m4, m3 ; 45 56 3448.dy1_w4_loop: 3449 movu xm11, [srcq+ssq*0] 3450 vinserti128 m11, [srcq+ssq*1], 1 3451 lea srcq, [srcq+ssq*2] 3452 pmaddwd m4, m0, m7 3453 pmaddwd m5, m1, m8 3454 pmaddwd m6, m2, m9 3455 mova m0, m1 3456 mova m1, m2 3457 paddd m4, m13 3458 paddd m5, m6 3459 pshufb m11, m14 3460 vpermq m11, m11, q3120 3461 pmaddubsw m11, m15 3462 phaddw m11, m11 3463 pmulhrsw m11, m12 3464 palignr m6, m11, m3, 12 3465 punpcklwd m2, m6, m11 ; 67 78 3466 mova m3, m11 3467 pmaddwd m6, m2, m10 3468 paddd m4, m5 3469 paddd m4, m6 3470 psrad 
m4, rndshift 3471 vextracti128 xm5, m4, 1 3472 packssdw xm4, xm5 3473%ifidn %1, put 3474 packuswb xm4, xm4 3475 pshuflw xm4, xm4, q3120 3476 movd [dstq+dsq*0], xm4 3477 pextrd [dstq+dsq*1], xm4, 1 3478 lea dstq, [dstq+dsq*2] 3479%else 3480 pshufd xm4, xm4, q3120 3481 mova [tmpq], xm4 3482 add tmpq, 16 3483%endif 3484 sub hd, 2 3485 jg .dy1_w4_loop 3486 MC_8TAP_SCALED_RET 3487.dy1_w8: 3488 mov dword [rsp+72], 1 3489 movifprep tmp_stridem, 16 3490 jmp .dy1_w_start 3491.dy1_w16: 3492 mov dword [rsp+72], 2 3493 movifprep tmp_stridem, 32 3494 jmp .dy1_w_start 3495.dy1_w32: 3496 mov dword [rsp+72], 4 3497 movifprep tmp_stridem, 64 3498 jmp .dy1_w_start 3499.dy1_w64: 3500 mov dword [rsp+72], 8 3501 movifprep tmp_stridem, 128 3502 jmp .dy1_w_start 3503.dy1_w128: 3504 mov dword [rsp+72], 16 3505 movifprep tmp_stridem, 256 3506.dy1_w_start: 3507 mov myd, mym 3508%ifidn %1, put 3509 movifnidn dsm, dsq 3510%endif 3511 shr t0d, 16 3512 sub srcq, 3 3513 shr myd, 6 3514 mov r4d, 64 << 24 3515 lea myd, [t1+myq] 3516 cmovnz r4q, [base+subpel_filters+myq*8] 3517 pmaddwd m8, [base+rescale_mul] 3518 movd xm15, t0d 3519 mov [rsp+76], t0d 3520 mov [rsp+80], srcq 3521 mov [rsp+88], r0q ; dstq / tmpq 3522%if UNIX64 3523 mov hm, hd 3524%endif 3525 shl dword dxm, 3 ; dx*8 3526 vpbroadcastd m15, xm15 3527 paddd m14, m8 ; mx+dx*[0-7] 3528 movq xm0, r4q 3529 punpcklbw xm0, xm0 3530 psraw xm0, 8 3531 mova [rsp+96], xm0 3532 jmp .dy1_hloop 3533.dy1_hloop_prep: 3534 dec dword [rsp+72] 3535 jz .ret 3536 add qword [rsp+88], 8*(isprep+1) 3537 mov hd, hm 3538 vpbroadcastd m8, dxm 3539 vpbroadcastd m10, [base+pd_0x3ff] 3540 paddd m14, m8, [rsp+32] 3541 vpbroadcastd m15, [rsp+76] 3542 pxor m9, m9 3543 mov srcq, [rsp+80] 3544 mov r0q, [rsp+88] ; dstq / tmpq 3545.dy1_hloop: 3546 vpbroadcastq m11, [base+pq_0x40000000] 3547 pand m6, m14, m10 3548 psrld m6, 6 3549 paddd m15, m6 3550 pcmpeqd m6, m9 3551 vextracti128 xm7, m15, 1 3552 movd r4d, xm15 3553 pextrd r6d, xm15, 2 3554 pextrd r7d, xm15, 1 3555 pextrd r9d, xm15, 3 3556 movd r10d, xm7 3557 pextrd r11d, xm7, 2 3558 pextrd r13d, xm7, 1 3559 pextrd rXd, xm7, 3 3560 movu [rsp+32], m14 3561 movq xm15, [base+subpel_filters+ r4*8] 3562 movq xm10, [base+subpel_filters+ r6*8] 3563 movhps xm15, [base+subpel_filters+ r7*8] 3564 movhps xm10, [base+subpel_filters+ r9*8] 3565 vinserti128 m15, [base+subpel_filters+r10*8], 1 3566 vinserti128 m10, [base+subpel_filters+r11*8], 1 3567 vpbroadcastq m9, [base+subpel_filters+r13*8] 3568 vpbroadcastq m8, [base+subpel_filters+ rX*8] 3569 psrld m14, 10 3570 vextracti128 xm7, m14, 1 3571 movq [rsp+64], xm14 3572 movd r4d, xm14 3573 pextrd r6d, xm14, 2 3574 pextrd r7d, xm14, 1 3575 pextrd r9d, xm14, 3 3576 movd r10d, xm7 3577 pextrd r11d, xm7, 2 3578 pextrd r13d, xm7, 1 3579 pextrd rXd, xm7, 3 3580 pshufd m5, m6, q1100 3581 pshufd m6, m6, q3322 3582 vpblendd m15, m9, 0xc0 3583 vpblendd m10, m8, 0xc0 3584 pblendvb m15, m11, m5 3585 pblendvb m10, m11, m6 3586 vbroadcasti128 m14, [base+subpel_s_shuf8] 3587 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 3588 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 3589 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 3590 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 3591 movu [rsp], m10 3592 vpbroadcastd m8, [rsp+0x60] 3593 vpbroadcastd m9, [rsp+0x64] 3594 vpbroadcastd m10, [rsp+0x68] 3595 vpbroadcastd m11, [rsp+0x6c] 3596 pshufb m0, m14 ; 01a 01b 3597 pshufb m1, m14 ; 23a 23b 3598 pshufb m2, m14 ; 45a 45b 3599 pshufb m3, m14 ; 67a 67b 3600 vbroadcasti128 m14, [base+wswap] 3601.dy1_vloop: 
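    ; dy == 1.0 vertical loop: m0-m3 hold the eight most recent horizontally
    ; filtered rows as interleaved word pairs and m8-m11 the constant vertical
    ; tap pairs; each iteration emits one output row, then filters exactly one
    ; new source row and shifts it into m0-m3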
    pmaddwd       m4, m0, m8
    pmaddwd       m5, m1, m9
    pmaddwd       m6, m2, m10
    pmaddwd       m7, m3, m11
    paddd         m4, m5
    paddd         m6, m7
    paddd         m4, m13
    paddd         m4, m6
    psrad         m4, rndshift
    vextracti128  xm5, m4, 1
    packssdw      xm4, xm5
%ifidn %1, put
    packuswb      xm4, xm4
    movq          [dstq], xm4
    add           dstq, dsm
%else
    mova          [tmpq], xm4
    add           tmpq, tmp_stridem
%endif
    dec           hd
    jz            .dy1_hloop_prep
    movq          xm4, [srcq+ r4]
    movq          xm5, [srcq+ r6]
    movhps        xm4, [srcq+ r7]
    movhps        xm5, [srcq+ r9]
    vinserti128   m4, [srcq+r10], 1
    vinserti128   m5, [srcq+r11], 1
    vpbroadcastq  m6, [srcq+r13]
    vpbroadcastq  m7, [srcq+ rX]
    add           srcq, ssq
    pshufb        m0, m14
    pshufb        m1, m14
    pshufb        m2, m14
    pshufb        m3, m14
    vpblendd      m4, m6, 0xc0
    vpblendd      m5, m7, 0xc0
    pmaddubsw     m4, m15
    pmaddubsw     m5, [rsp]
    phaddw        m4, m5
    pslld         m5, m4, 16
    paddw         m4, m5
    pmulhrsw      m4, m12
    pblendw       m0, m1, 0xaa
    pblendw       m1, m2, 0xaa
    pblendw       m2, m3, 0xaa
    pblendw       m3, m4, 0xaa
    jmp           .dy1_vloop
.dy2:
    movzx         wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
    add           wq, base_reg
    jmp           wq
%ifidn %1, put
.dy2_w2:
    mov           myd, mym
    movzx         t0d, t0b
    dec           srcq
    movd          xm15, t0d
    punpckldq     m8, m9, m8
    paddd         m14, m8 ; mx+dx*[0-1]
    vpbroadcastd  m11, [base+pd_0x4000]
    vpbroadcastd  xm15, xm15
    pand          m8, m14, m10
    psrld         m8, 6
    paddd         xm15, xm8
    movd          r4d, xm15
    pextrd        r6d, xm15, 1
    vbroadcasti128 m5, [base+bdct_lb_dw]
    vbroadcasti128 m6, [base+subpel_s_shuf2]
    vpbroadcastd  m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd  m7, [base+subpel_filters+r6*8+2]
    pcmpeqd       m8, m9
    psrld         m14, 10
    movq          xm0, [srcq+ssq*0]
    vpbroadcastq  m2, [srcq+ssq*1]
    movhps        xm0, [srcq+ssq*2]
    vpbroadcastq  m3, [srcq+ss3q ]
    lea           srcq, [srcq+ssq*4]
    pshufb        m14, m5
    paddb         m14, m6
    vpblendd      m15, m7, 0xaa
    pblendvb      m15, m11, m8
    movhps        xm1, [srcq+ssq*0]
    vpbroadcastq  m4, [srcq+ssq*1]
    lea           srcq, [srcq+ssq*2]
    shr           myd, 6
    mov           r4d, 64 << 24
    lea           myd, [t1+myq]
    cmovnz        r4q, [base+subpel_filters+myq*8]
    vpblendd      m0, m2, 0x30
    vpblendd      m1, m4, 0xc0
    vpblendd      m0, m3, 0xc0
    pshufb        m0, m14
    pshufb        m1, m14
    pmaddubsw     m0, m15
    pmaddubsw     m1, m15
    movq          xm11, r4q
    punpcklbw     xm11, xm11
    psraw         xm11, 8
    phaddw        m0, m1
    pmulhrsw      m0, m12 ; 0 2 _ 4 1 3 _ 5
    pshufd        xm8, xm11, q0000
    pshufd        xm9, xm11, q1111
    pshufd        xm10, xm11, q2222
    pshufd        xm11, xm11, q3333
    pshufd        m2, m0, q3110 ; 0 2 2 4 1 3 3 5
    vextracti128  xm1, m2, 1
    punpcklwd     xm3, xm2, xm1 ; 01 23
    punpckhwd     xm2, xm1      ; 23 45
.dy2_w2_loop:
    movq          xm6, [srcq+ssq*0]
    vpbroadcastq  m7, [srcq+ssq*1]
    movhps        xm6, [srcq+ssq*2]
    vpbroadcastq  m1, [srcq+ss3q ]
    lea           srcq, [srcq+ssq*4]
    pmaddwd       xm4, xm3, xm8
    pmaddwd       xm5, xm2, xm9
    vpblendd      m6, m7, 0x30
    vpblendd      m6, m1, 0xc0
    pshufb        m6, m14
    pmaddubsw     m6, m15
    phaddw        m6, m6
    pmulhrsw      m6, m12
    palignr       m0, m6, m0, 8
    pshufd        m2, m0, q3221
    vextracti128  xm1, m2, 1
    punpcklwd     xm3, xm2, xm1 ; 45 67
    punpckhwd     xm2, xm1      ; 67 89
    pmaddwd       xm6, xm3, xm10
    pmaddwd       xm7, xm2, xm11
    paddd         xm4, xm5
    paddd         xm4, xm13
    paddd         xm6, xm7
    paddd         xm4, xm6
    psrad         xm4, rndshift
    packssdw      xm4, xm4
    packuswb      xm4, xm4
    pextrw        [dstq+dsq*0], xm4, 0
    pextrw        [dstq+dsq*1], xm4, 1
3740 lea dstq, [dstq+dsq*2] 3741 sub hd, 2 3742 jg .dy2_w2_loop 3743 RET 3744%endif 3745.dy2_w4: 3746 mov myd, mym 3747 vbroadcasti128 m7, [base+rescale_mul] 3748 movzx t0d, t0b 3749 dec srcq 3750 movd xm15, t0d 3751 pmaddwd m8, m7 3752 vpbroadcastd m11, [base+pd_0x4000] 3753 vpbroadcastd xm15, xm15 3754 paddd m14, m8 ; mx+dx*[0-3] 3755 pand m8, m14, m10 3756 psrld m8, 6 3757 paddd xm15, xm8 3758 movd r4d, xm15 3759 pextrd r6d, xm15, 1 3760 pextrd r11d, xm15, 2 3761 pextrd r13d, xm15, 3 3762 movd xm15, [base+subpel_filters+r4*8+2] 3763 vbroadcasti128 m5, [base+bdct_lb_dw] 3764 vpbroadcastq m6, [base+subpel_s_shuf2] 3765 pinsrd xm15, [base+subpel_filters+r6*8+2], 1 3766 pcmpeqd m8, m9 3767 psrld m14, 10 3768 movu xm0, [srcq+ssq*0] 3769 movu xm2, [srcq+ssq*2] 3770 pinsrd xm15, [base+subpel_filters+r11*8+2], 2 3771 movu xm1, [srcq+ssq*1] 3772 movu xm3, [srcq+ss3q ] 3773 pinsrd xm15, [base+subpel_filters+r13*8+2], 3 3774 lea srcq, [srcq+ssq*4] 3775 shr myd, 6 3776 mov r4d, 64 << 24 3777 lea myd, [t1+myq] 3778 cmovnz r4q, [base+subpel_filters+myq*8] 3779 vinserti128 m15, xm15, 1 3780 pshufb m14, m5 3781 paddb m14, m6 3782 vinserti128 m2, [srcq+ssq*0], 1 3783 vinserti128 m3, [srcq+ssq*1], 1 3784 lea srcq, [srcq+ssq*2] 3785 pblendvb m15, m11, m8 3786 pshufb xm0, xm14 3787 pshufb m2, m14 3788 pshufb xm1, xm14 3789 pshufb m3, m14 3790 pmaddubsw xm0, xm15 3791 pmaddubsw m2, m15 3792 pmaddubsw xm1, xm15 3793 pmaddubsw m3, m15 3794 movq xm11, r4q 3795 punpcklbw xm11, xm11 3796 psraw xm11, 8 3797 vinserti128 m11, xm11, 1 3798 phaddw m0, m2 3799 phaddw m1, m3 3800 pmulhrsw m0, m12 ; 0 2 _ 4 3801 pmulhrsw m1, m12 ; 1 3 _ 5 3802 pshufd m8, m11, q0000 3803 pshufd m9, m11, q1111 3804 pshufd m10, m11, q2222 3805 pshufd m11, m11, q3333 3806 punpcklwd xm2, xm0, xm1 3807 punpckhwd m1, m0, m1 ; 23 45 3808 vinserti128 m0, m2, xm1, 1 ; 01 23 3809.dy2_w4_loop: 3810 movu xm6, [srcq+ssq*0] 3811 movu xm7, [srcq+ssq*1] 3812 vinserti128 m6, [srcq+ssq*2], 1 3813 vinserti128 m7, [srcq+ss3q ], 1 3814 lea srcq, [srcq+ssq*4] 3815 pmaddwd m4, m0, m8 3816 pmaddwd m5, m1, m9 3817 pshufb m6, m14 3818 pshufb m7, m14 3819 pmaddubsw m6, m15 3820 pmaddubsw m7, m15 3821 psrld m2, m6, 16 3822 pslld m3, m7, 16 3823 paddw m6, m2 3824 paddw m7, m3 3825 pblendw m6, m7, 0xaa ; 67 89 3826 pmulhrsw m6, m12 3827 paddd m4, m5 3828 vpblendd m0, m1, m6, 0x0f 3829 mova m1, m6 3830 vpermq m0, m0, q1032 ; 45 67 3831 pmaddwd m6, m0, m10 3832 pmaddwd m7, m1, m11 3833 paddd m4, m13 3834 paddd m6, m7 3835 paddd m4, m6 3836 psrad m4, rndshift 3837 vextracti128 xm5, m4, 1 3838 packssdw xm4, xm5 3839%ifidn %1, put 3840 packuswb xm4, xm4 3841 movd [dstq+dsq*0], xm4 3842 pextrd [dstq+dsq*1], xm4, 1 3843 lea dstq, [dstq+dsq*2] 3844%else 3845 mova [tmpq], xm4 3846 add tmpq, 16 3847%endif 3848 sub hd, 2 3849 jg .dy2_w4_loop 3850 MC_8TAP_SCALED_RET 3851.dy2_w8: 3852 mov dword [rsp+40], 1 3853 movifprep tmp_stridem, 16 3854 jmp .dy2_w_start 3855.dy2_w16: 3856 mov dword [rsp+40], 2 3857 movifprep tmp_stridem, 32 3858 jmp .dy2_w_start 3859.dy2_w32: 3860 mov dword [rsp+40], 4 3861 movifprep tmp_stridem, 64 3862 jmp .dy2_w_start 3863.dy2_w64: 3864 mov dword [rsp+40], 8 3865 movifprep tmp_stridem, 128 3866 jmp .dy2_w_start 3867.dy2_w128: 3868 mov dword [rsp+40], 16 3869 movifprep tmp_stridem, 256 3870.dy2_w_start: 3871 mov myd, mym 3872%ifidn %1, put 3873 movifnidn dsm, dsq 3874%endif 3875 shr t0d, 16 3876 sub srcq, 3 3877 shr myd, 6 3878 mov r4d, 64 << 24 3879 lea myd, [t1+myq] 3880 cmovnz r4q, [base+subpel_filters+myq*8] 3881 pmaddwd m8, [base+rescale_mul] 3882 
movd xm15, t0d 3883 mov [rsp+64], t0d 3884 mov [rsp+48], srcq 3885 mov [rsp+56], r0q ; dstq / tmpq 3886%if UNIX64 3887 mov hm, hd 3888%endif 3889 shl dword dxm, 3 ; dx*8 3890 vpbroadcastd m15, xm15 3891 paddd m14, m8 ; mx+dx*[0-7] 3892 movq xm0, r4q 3893 punpcklbw xm0, xm0 3894 psraw xm0, 8 3895 mova [rsp+0x50], xm0 3896 jmp .dy2_hloop 3897.dy2_hloop_prep: 3898 dec dword [rsp+40] 3899 jz .ret 3900 add qword [rsp+56], 8*(isprep+1) 3901 mov hd, hm 3902 vpbroadcastd m8, dxm 3903 vpbroadcastd m10, [base+pd_0x3ff] 3904 paddd m14, m8, [rsp] 3905 vpbroadcastd m15, [rsp+64] 3906 pxor m9, m9 3907 mov srcq, [rsp+48] 3908 mov r0q, [rsp+56] ; dstq / tmpq 3909.dy2_hloop: 3910 vpbroadcastq m11, [base+pq_0x40000000] 3911 pand m6, m14, m10 3912 psrld m6, 6 3913 paddd m15, m6 3914 pcmpeqd m6, m9 3915 vextracti128 xm7, m15, 1 3916 movd r4d, xm15 3917 pextrd r6d, xm15, 2 3918 pextrd r7d, xm15, 1 3919 pextrd r9d, xm15, 3 3920 movd r10d, xm7 3921 pextrd r11d, xm7, 2 3922 pextrd r13d, xm7, 1 3923 pextrd rXd, xm7, 3 3924 movu [rsp], m14 3925 movq xm15, [base+subpel_filters+ r4*8] 3926 movq xm10, [base+subpel_filters+ r6*8] 3927 movhps xm15, [base+subpel_filters+ r7*8] 3928 movhps xm10, [base+subpel_filters+ r9*8] 3929 vinserti128 m15, [base+subpel_filters+r10*8], 1 3930 vinserti128 m10, [base+subpel_filters+r11*8], 1 3931 vpbroadcastq m9, [base+subpel_filters+r13*8] 3932 vpbroadcastq m8, [base+subpel_filters+ rX*8] 3933 psrld m14, 10 3934 vextracti128 xm7, m14, 1 3935 movd r4d, xm14 3936 pextrd r6d, xm14, 2 3937 pextrd r7d, xm14, 1 3938 pextrd r9d, xm14, 3 3939 movd r10d, xm7 3940 pextrd r11d, xm7, 2 3941 pextrd r13d, xm7, 1 3942 pextrd rXd, xm7, 3 3943 pshufd m5, m6, q1100 3944 pshufd m6, m6, q3322 3945 vpblendd m15, m9, 0xc0 3946 vpblendd m10, m8, 0xc0 3947 pblendvb m15, m11, m5 3948 pblendvb m10, m11, m6 3949 vbroadcasti128 m14, [base+subpel_s_shuf8] 3950 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 3951 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 3952 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 3953 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 3954 vpbroadcastd m8, [rsp+0x50] 3955 vpbroadcastd m9, [rsp+0x54] 3956 vpbroadcastd m11, [rsp+0x58] 3957 vpbroadcastd m4, [rsp+0x5c] 3958 pshufb m0, m14 ; 01a 01b 3959 pshufb m1, m14 ; 23a 23b 3960 pshufb m2, m14 ; 45a 45b 3961 pshufb m3, m14 ; 67a 67b 3962 SWAP m14, m4 3963.dy2_vloop: 3964 pmaddwd m4, m0, m8 3965 pmaddwd m5, m1, m9 3966 pmaddwd m6, m2, m11 3967 pmaddwd m7, m3, m14 3968 paddd m4, m5 3969 paddd m6, m7 3970 paddd m4, m13 3971 paddd m4, m6 3972 psrad m4, rndshift 3973 vextracti128 xm5, m4, 1 3974 packssdw xm4, xm5 3975%ifidn %1, put 3976 packuswb xm4, xm4 3977 movq [dstq], xm4 3978 add dstq, dsm 3979%else 3980 mova [tmpq], xm4 3981 add tmpq, tmp_stridem 3982%endif 3983 dec hd 3984 jz .dy2_hloop_prep 3985 mova m0, m1 3986 mova m1, m2 3987 mova m2, m3 3988 movq xm3, [srcq+ r4] 3989 movq xm4, [srcq+ r6] 3990 movhps xm3, [srcq+ r7] 3991 movhps xm4, [srcq+ r9] 3992 vinserti128 m3, [srcq+r10], 1 3993 vinserti128 m4, [srcq+r11], 1 3994 vpbroadcastq m5, [srcq+r13] 3995 vpbroadcastq m6, [srcq+ rX] 3996 add srcq, ssq 3997 vpblendd m3, m5, 0xc0 3998 vpblendd m4, m6, 0xc0 3999 pmaddubsw m3, m15 4000 pmaddubsw m4, m10 4001 phaddw m3, m4 4002 movq xm4, [srcq+ r4] 4003 movq xm5, [srcq+ r6] 4004 movhps xm4, [srcq+ r7] 4005 movhps xm5, [srcq+ r9] 4006 vinserti128 m4, [srcq+r10], 1 4007 vinserti128 m5, [srcq+r11], 1 4008 vpbroadcastq m6, [srcq+r13] 4009 vpbroadcastq m7, [srcq+ rX] 4010 add srcq, ssq 4011 vpblendd m4, m6, 0xc0 4012 
vpblendd m5, m7, 0xc0 4013 pmaddubsw m4, m15 4014 pmaddubsw m5, m10 4015 phaddw m4, m5 4016 psrld m5, m3, 16 4017 pslld m6, m4, 16 4018 paddw m3, m5 4019 paddw m4, m6 4020 pblendw m3, m4, 0xaa 4021 pmulhrsw m3, m12 4022 jmp .dy2_vloop 4023.ret: 4024 MC_8TAP_SCALED_RET 0 4025%undef isprep 4026%endmacro 4027 4028%macro BILIN_SCALED_FN 1 4029cglobal %1_bilin_scaled 4030 mov t0d, (5*15 << 16) | 5*15 4031 mov t1d, t0d 4032 jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) 4033%endmacro 4034 4035%if WIN64 4036DECLARE_REG_TMP 6, 5 4037%else 4038DECLARE_REG_TMP 6, 8 4039%endif 4040 4041%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, 4042%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, 4043 4044BILIN_SCALED_FN put 4045PUT_8TAP_SCALED_FN sharp, SHARP, SHARP 4046PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH 4047PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP 4048PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH 4049PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR 4050PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP 4051PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR 4052PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH 4053PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR 4054MC_8TAP_SCALED put 4055 4056%if WIN64 4057DECLARE_REG_TMP 5, 4 4058%else 4059DECLARE_REG_TMP 6, 7 4060%endif 4061 4062BILIN_SCALED_FN prep 4063PREP_8TAP_SCALED_FN sharp, SHARP, SHARP 4064PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH 4065PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP 4066PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH 4067PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR 4068PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP 4069PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR 4070PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH 4071PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR 4072MC_8TAP_SCALED prep 4073 4074%macro WARP_V 5 ; dst, 02, 46, 13, 57 4075 ; Can be done using gathers, but that's terribly slow on many CPU:s 4076 lea tmp1d, [myq+deltaq*4] 4077 lea tmp2d, [myq+deltaq*1] 4078 shr myd, 10 4079 shr tmp1d, 10 4080 movq xm8, [filterq+myq *8] 4081 vinserti128 m8, [filterq+tmp1q*8], 1 ; a e 4082 lea tmp1d, [tmp2q+deltaq*4] 4083 lea myd, [tmp2q+deltaq*1] 4084 shr tmp2d, 10 4085 shr tmp1d, 10 4086 movq xm0, [filterq+tmp2q*8] 4087 vinserti128 m0, [filterq+tmp1q*8], 1 ; b f 4088 lea tmp1d, [myq+deltaq*4] 4089 lea tmp2d, [myq+deltaq*1] 4090 shr myd, 10 4091 shr tmp1d, 10 4092 movq xm9, [filterq+myq *8] 4093 vinserti128 m9, [filterq+tmp1q*8], 1 ; c g 4094 lea tmp1d, [tmp2q+deltaq*4] 4095 lea myd, [tmp2q+gammaq] ; my += gamma 4096 shr tmp2d, 10 4097 shr tmp1d, 10 4098 punpcklwd m8, m0 4099 movq xm0, [filterq+tmp2q*8] 4100 vinserti128 m0, [filterq+tmp1q*8], 1 ; d h 4101 punpcklwd m0, m9, m0 4102 punpckldq m9, m8, m0 4103 punpckhdq m0, m8, m0 4104 punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 4105 punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 4106 pmaddwd m%2, m8 4107 pmaddwd m9, m%3 4108 punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 4109 punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 4110 pmaddwd m8, m%4 4111 pmaddwd m0, m%5 4112 paddd m%2, m9 4113 paddd m0, m8 4114 paddd m%1, m0, m%2 4115%endmacro 4116 4117cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts 4118%if WIN64 4119 sub rsp, 0xa0 4120%endif 4121 call mangle(private_prefix %+ _warp_affine_8x8_avx2).main 4122.loop: 4123 psrad m7, 13 4124 psrad m0, 13 4125 packssdw m7, m0 4126 pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 4127 vpermq m7, m7, q3120 4128 mova [tmpq+tsq*0], xm7 4129 vextracti128 [tmpq+tsq*2], m7, 1 4130 dec r4d 4131 jz 
mangle(private_prefix %+ _warp_affine_8x8_avx2).end 4132 call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2 4133 lea tmpq, [tmpq+tsq*4] 4134 jmp .loop 4135 4136cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ 4137 beta, filter, tmp1, delta, my, gamma 4138%if WIN64 4139 sub rsp, 0xa0 4140 %assign xmm_regs_used 16 4141 %assign stack_size_padded 0xa0 4142 %assign stack_offset stack_offset+stack_size_padded 4143%endif 4144 call .main 4145 jmp .start 4146.loop: 4147 call .main2 4148 lea dstq, [dstq+dsq*2] 4149.start: 4150 psrad m7, 18 4151 psrad m0, 18 4152 packusdw m7, m0 4153 pavgw m7, m11 ; (x + (1 << 10)) >> 11 4154 vextracti128 xm0, m7, 1 4155 packuswb xm7, xm0 4156 pshufd xm7, xm7, q3120 4157 movq [dstq+dsq*0], xm7 4158 movhps [dstq+dsq*1], xm7 4159 dec r4d 4160 jg .loop 4161.end: 4162 RET 4163ALIGN function_align 4164.main: 4165 ; Stack args offset by one (r4m -> r5m etc.) due to call 4166%if WIN64 4167 mov abcdq, r5m 4168 mov mxd, r6m 4169 movaps [rsp+stack_offset+0x10], xmm6 4170 movaps [rsp+stack_offset+0x20], xmm7 4171 movaps [rsp+0x28], xmm8 4172 movaps [rsp+0x38], xmm9 4173 movaps [rsp+0x48], xmm10 4174 movaps [rsp+0x58], xmm11 4175 movaps [rsp+0x68], xmm12 4176 movaps [rsp+0x78], xmm13 4177 movaps [rsp+0x88], xmm14 4178 movaps [rsp+0x98], xmm15 4179%endif 4180 movsx alphad, word [abcdq+2*0] 4181 movsx betad, word [abcdq+2*1] 4182 mova m12, [warp_8x8_shufA] 4183 mova m13, [warp_8x8_shufB] 4184 vpbroadcastd m14, [pw_8192] 4185 vpbroadcastd m15, [pd_32768] 4186 pxor m11, m11 4187 lea filterq, [mc_warp_filter] 4188 lea tmp1q, [ssq*3+3] 4189 add mxd, 512+(64<<10) 4190 lea tmp2d, [alphaq*3] 4191 sub srcq, tmp1q ; src -= src_stride*3 + 3 4192 sub betad, tmp2d ; beta -= alpha*3 4193 mov myd, r7m 4194 call .h 4195 psrld m1, m0, 16 4196 call .h 4197 psrld m4, m0, 16 4198 call .h 4199 pblendw m1, m0, 0xaa ; 02 4200 call .h 4201 pblendw m4, m0, 0xaa ; 13 4202 call .h 4203 psrld m2, m1, 16 4204 pblendw m2, m0, 0xaa ; 24 4205 call .h 4206 psrld m5, m4, 16 4207 pblendw m5, m0, 0xaa ; 35 4208 call .h 4209 psrld m3, m2, 16 4210 pblendw m3, m0, 0xaa ; 46 4211 movsx deltad, word [abcdq+2*2] 4212 movsx gammad, word [abcdq+2*3] 4213 add myd, 512+(64<<10) 4214 mov r4d, 4 4215 lea tmp1d, [deltaq*3] 4216 sub gammad, tmp1d ; gamma -= delta*3 4217.main2: 4218 call .h 4219 psrld m6, m5, 16 4220 pblendw m6, m0, 0xaa ; 57 4221 WARP_V 7, 1, 3, 4, 6 4222 call .h 4223 mova m1, m2 4224 mova m2, m3 4225 psrld m3, 16 4226 pblendw m3, m0, 0xaa ; 68 4227 WARP_V 0, 4, 6, 1, 3 4228 mova m4, m5 4229 mova m5, m6 4230 ret 4231ALIGN function_align 4232.h: 4233 lea tmp1d, [mxq+alphaq*4] 4234 lea tmp2d, [mxq+alphaq*1] 4235 vbroadcasti128 m10, [srcq] 4236 shr mxd, 10 4237 shr tmp1d, 10 4238 movq xm8, [filterq+mxq *8] 4239 vinserti128 m8, [filterq+tmp1q*8], 1 4240 lea tmp1d, [tmp2q+alphaq*4] 4241 lea mxd, [tmp2q+alphaq*1] 4242 shr tmp2d, 10 4243 shr tmp1d, 10 4244 movq xm0, [filterq+tmp2q*8] 4245 vinserti128 m0, [filterq+tmp1q*8], 1 4246 lea tmp1d, [mxq+alphaq*4] 4247 lea tmp2d, [mxq+alphaq*1] 4248 shr mxd, 10 4249 shr tmp1d, 10 4250 movq xm9, [filterq+mxq *8] 4251 vinserti128 m9, [filterq+tmp1q*8], 1 4252 lea tmp1d, [tmp2q+alphaq*4] 4253 lea mxd, [tmp2q+betaq] ; mx += beta 4254 shr tmp2d, 10 4255 shr tmp1d, 10 4256 punpcklqdq m8, m0 ; 0 1 4 5 4257 movq xm0, [filterq+tmp2q*8] 4258 vinserti128 m0, [filterq+tmp1q*8], 1 4259 punpcklqdq m9, m0 ; 2 3 6 7 4260 pshufb m0, m10, m12 4261 pmaddubsw m0, m8 4262 pshufb m10, m13 4263 pmaddubsw m10, m9 4264 add srcq, ssq 4265 phaddw m0, m10 4266 pmaddwd 
m0, m14 ; 17-bit intermediate, upshifted by 13 4267 paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword 4268 ret 4269 4270%macro BIDIR_FN 1 ; op 4271 %1 0 4272 lea stride3q, [strideq*3] 4273 jmp wq 4274.w4: 4275 vextracti128 xm1, m0, 1 4276 movd [dstq ], xm0 4277 pextrd [dstq+strideq*1], xm0, 1 4278 movd [dstq+strideq*2], xm1 4279 pextrd [dstq+stride3q ], xm1, 1 4280 cmp hd, 4 4281 je .ret 4282 lea dstq, [dstq+strideq*4] 4283 pextrd [dstq ], xm0, 2 4284 pextrd [dstq+strideq*1], xm0, 3 4285 pextrd [dstq+strideq*2], xm1, 2 4286 pextrd [dstq+stride3q ], xm1, 3 4287 cmp hd, 8 4288 je .ret 4289 %1 2 4290 lea dstq, [dstq+strideq*4] 4291 vextracti128 xm1, m0, 1 4292 movd [dstq ], xm0 4293 pextrd [dstq+strideq*1], xm0, 1 4294 movd [dstq+strideq*2], xm1 4295 pextrd [dstq+stride3q ], xm1, 1 4296 lea dstq, [dstq+strideq*4] 4297 pextrd [dstq ], xm0, 2 4298 pextrd [dstq+strideq*1], xm0, 3 4299 pextrd [dstq+strideq*2], xm1, 2 4300 pextrd [dstq+stride3q ], xm1, 3 4301.ret: 4302 RET 4303.w8_loop: 4304 %1_INC_PTR 2 4305 %1 0 4306 lea dstq, [dstq+strideq*4] 4307.w8: 4308 vextracti128 xm1, m0, 1 4309 movq [dstq ], xm0 4310 movq [dstq+strideq*1], xm1 4311 movhps [dstq+strideq*2], xm0 4312 movhps [dstq+stride3q ], xm1 4313 sub hd, 4 4314 jg .w8_loop 4315 RET 4316.w16_loop: 4317 %1_INC_PTR 4 4318 %1 0 4319 lea dstq, [dstq+strideq*4] 4320.w16: 4321 vpermq m0, m0, q3120 4322 mova [dstq ], xm0 4323 vextracti128 [dstq+strideq*1], m0, 1 4324 %1 2 4325 vpermq m0, m0, q3120 4326 mova [dstq+strideq*2], xm0 4327 vextracti128 [dstq+stride3q ], m0, 1 4328 sub hd, 4 4329 jg .w16_loop 4330 RET 4331.w32_loop: 4332 %1_INC_PTR 4 4333 %1 0 4334 lea dstq, [dstq+strideq*2] 4335.w32: 4336 vpermq m0, m0, q3120 4337 mova [dstq+strideq*0], m0 4338 %1 2 4339 vpermq m0, m0, q3120 4340 mova [dstq+strideq*1], m0 4341 sub hd, 2 4342 jg .w32_loop 4343 RET 4344.w64_loop: 4345 %1_INC_PTR 4 4346 %1 0 4347 add dstq, strideq 4348.w64: 4349 vpermq m0, m0, q3120 4350 mova [dstq], m0 4351 %1 2 4352 vpermq m0, m0, q3120 4353 mova [dstq+32], m0 4354 dec hd 4355 jg .w64_loop 4356 RET 4357.w128_loop: 4358 %1 0 4359 add dstq, strideq 4360.w128: 4361 vpermq m0, m0, q3120 4362 mova [dstq+0*32], m0 4363 %1 2 4364 vpermq m0, m0, q3120 4365 mova [dstq+1*32], m0 4366 %1_INC_PTR 8 4367 %1 -4 4368 vpermq m0, m0, q3120 4369 mova [dstq+2*32], m0 4370 %1 -2 4371 vpermq m0, m0, q3120 4372 mova [dstq+3*32], m0 4373 dec hd 4374 jg .w128_loop 4375 RET 4376%endmacro 4377 4378%macro AVG 1 ; src_offset 4379 mova m0, [tmp1q+(%1+0)*32] 4380 paddw m0, [tmp2q+(%1+0)*32] 4381 mova m1, [tmp1q+(%1+1)*32] 4382 paddw m1, [tmp2q+(%1+1)*32] 4383 pmulhrsw m0, m2 4384 pmulhrsw m1, m2 4385 packuswb m0, m1 4386%endmacro 4387 4388%macro AVG_INC_PTR 1 4389 add tmp1q, %1*32 4390 add tmp2q, %1*32 4391%endmacro 4392 4393cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 4394%define base r6-avg %+ SUFFIX %+ _table 4395 lea r6, [avg %+ SUFFIX %+ _table] 4396 tzcnt wd, wm 4397 movifnidn hd, hm 4398 movsxd wq, dword [r6+wq*4] 4399 vpbroadcastd m2, [base+pw_1024] 4400 add wq, r6 4401 BIDIR_FN AVG 4402 4403%macro W_AVG 1 ; src_offset 4404 ; (a * weight + b * (16 - weight) + 128) >> 8 4405 ; = ((a - b) * weight + (b << 4) + 128) >> 8 4406 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 4407 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 4408 mova m0, [tmp1q+(%1+0)*32] 4409 psubw m2, m0, [tmp2q+(%1+0)*32] 4410 mova m1, [tmp1q+(%1+1)*32] 4411 psubw m3, m1, [tmp2q+(%1+1)*32] 4412 pmulhw m2, m4 4413 pmulhw m3, m4 4414 paddw m0, m2 4415 paddw m1, m3 4416 
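    ; m0/m1 = a + (((a - b) * ((weight-16) << 12)) >> 16); the pmulhrsw
    ; against pw_2048 below performs the final (x + 8) >> 4 of the
    ; identity above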
pmulhrsw m0, m5 4417 pmulhrsw m1, m5 4418 packuswb m0, m1 4419%endmacro 4420 4421%define W_AVG_INC_PTR AVG_INC_PTR 4422 4423cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 4424%define base r6-w_avg %+ SUFFIX %+ _table 4425 lea r6, [w_avg %+ SUFFIX %+ _table] 4426 tzcnt wd, wm 4427 movifnidn hd, hm 4428 vpbroadcastw m4, r6m ; weight 4429 movsxd wq, dword [r6+wq*4] 4430 vpbroadcastd m5, [base+pw_2048] 4431 psllw m4, 12 ; (weight-16) << 12 when interpreted as signed 4432 add wq, r6 4433 cmp dword r6m, 7 4434 jg .weight_gt7 4435 mov r6, tmp1q 4436 pxor m0, m0 4437 mov tmp1q, tmp2q 4438 psubw m4, m0, m4 ; -weight 4439 mov tmp2q, r6 4440.weight_gt7: 4441 BIDIR_FN W_AVG 4442 4443%macro MASK 1 ; src_offset 4444 ; (a * m + b * (64 - m) + 512) >> 10 4445 ; = ((a - b) * m + (b << 6) + 512) >> 10 4446 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 4447 vpermq m3, [maskq+%1*16], q3120 4448 mova m0, [tmp2q+(%1+0)*32] 4449 psubw m1, m0, [tmp1q+(%1+0)*32] 4450 psubb m3, m4, m3 4451 paddw m1, m1 ; (b - a) << 1 4452 paddb m3, m3 4453 punpcklbw m2, m4, m3 ; -m << 9 4454 pmulhw m1, m2 4455 paddw m0, m1 4456 mova m1, [tmp2q+(%1+1)*32] 4457 psubw m2, m1, [tmp1q+(%1+1)*32] 4458 paddw m2, m2 4459 punpckhbw m3, m4, m3 4460 pmulhw m2, m3 4461 paddw m1, m2 4462 pmulhrsw m0, m5 4463 pmulhrsw m1, m5 4464 packuswb m0, m1 4465%endmacro 4466 4467%macro MASK_INC_PTR 1 4468 add maskq, %1*16 4469 add tmp2q, %1*32 4470 add tmp1q, %1*32 4471%endmacro 4472 4473cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 4474%define base r7-mask %+ SUFFIX %+ _table 4475 lea r7, [mask %+ SUFFIX %+ _table] 4476 tzcnt wd, wm 4477 movifnidn hd, hm 4478 mov maskq, maskmp 4479 movsxd wq, dword [r7+wq*4] 4480 vpbroadcastd m5, [base+pw_2048] 4481 pxor m4, m4 4482 add wq, r7 4483 BIDIR_FN MASK 4484 4485%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 4486 mova m%1, [tmp1q+32*%3] 4487 mova m1, [tmp2q+32*%3] 4488 psubw m1, m%1 4489 pabsw m%2, m1 4490 psubusw m%2, m6, m%2 4491 psrlw m%2, 8 ; 64 - m 4492 psllw m2, m%2, 10 4493 pmulhw m1, m2 4494 paddw m%1, m1 4495 mova m1, [tmp1q+32*%4] 4496 mova m2, [tmp2q+32*%4] 4497 psubw m2, m1 4498 pabsw m3, m2 4499 psubusw m3, m6, m3 4500 psrlw m3, 8 4501%if %5 4502 packuswb m%2, m3 4503 psubb m%2, m5, m%2 4504 vpermq m%2, m%2, q3120 4505%else 4506 phaddw m%2, m3 4507%endif 4508 psllw m3, 10 4509 pmulhw m2, m3 4510 paddw m1, m2 4511 pmulhrsw m%1, m7 4512 pmulhrsw m1, m7 4513 packuswb m%1, m1 4514%endmacro 4515 4516cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask 4517%define base r6-blend_avx2_table 4518 lea r6, [blend_avx2_table] 4519 tzcnt wd, wm 4520 movifnidn hd, hm 4521 movifnidn maskq, maskmp 4522 movsxd wq, dword [r6+wq*4] 4523 vpbroadcastd m4, [base+pb_64] 4524 vpbroadcastd m5, [base+pw_512] 4525 add wq, r6 4526 lea r6, [dsq*3] 4527 jmp wq 4528.w4: 4529 movd xm0, [dstq+dsq*0] 4530 pinsrd xm0, [dstq+dsq*1], 1 4531 vpbroadcastd xm1, [dstq+dsq*2] 4532 pinsrd xm1, [dstq+r6 ], 3 4533 mova xm6, [maskq] 4534 psubb xm3, xm4, xm6 4535 punpcklbw xm2, xm3, xm6 4536 punpckhbw xm3, xm6 4537 mova xm6, [tmpq] 4538 add maskq, 4*4 4539 add tmpq, 4*4 4540 punpcklbw xm0, xm6 4541 punpckhbw xm1, xm6 4542 pmaddubsw xm0, xm2 4543 pmaddubsw xm1, xm3 4544 pmulhrsw xm0, xm5 4545 pmulhrsw xm1, xm5 4546 packuswb xm0, xm1 4547 movd [dstq+dsq*0], xm0 4548 pextrd [dstq+dsq*1], xm0, 1 4549 pextrd [dstq+dsq*2], xm0, 2 4550 pextrd [dstq+r6 ], xm0, 3 4551 lea dstq, [dstq+dsq*4] 4552 sub hd, 4 4553 jg .w4 4554 RET 4555ALIGN function_align 4556.w8: 4557 movq xm1, [dstq+dsq*0] 4558 movhps xm1, 
[dstq+dsq*1] 4559 vpbroadcastq m2, [dstq+dsq*2] 4560 vpbroadcastq m3, [dstq+r6 ] 4561 mova m0, [maskq] 4562 mova m6, [tmpq] 4563 add maskq, 8*4 4564 add tmpq, 8*4 4565 vpblendd m1, m2, 0x30 4566 vpblendd m1, m3, 0xc0 4567 psubb m3, m4, m0 4568 punpcklbw m2, m3, m0 4569 punpckhbw m3, m0 4570 punpcklbw m0, m1, m6 4571 punpckhbw m1, m6 4572 pmaddubsw m0, m2 4573 pmaddubsw m1, m3 4574 pmulhrsw m0, m5 4575 pmulhrsw m1, m5 4576 packuswb m0, m1 4577 vextracti128 xm1, m0, 1 4578 movq [dstq+dsq*0], xm0 4579 movhps [dstq+dsq*1], xm0 4580 movq [dstq+dsq*2], xm1 4581 movhps [dstq+r6 ], xm1 4582 lea dstq, [dstq+dsq*4] 4583 sub hd, 4 4584 jg .w8 4585 RET 4586ALIGN function_align 4587.w16: 4588 mova m0, [maskq] 4589 mova xm1, [dstq+dsq*0] 4590 vinserti128 m1, [dstq+dsq*1], 1 4591 psubb m3, m4, m0 4592 punpcklbw m2, m3, m0 4593 punpckhbw m3, m0 4594 mova m6, [tmpq] 4595 add maskq, 16*2 4596 add tmpq, 16*2 4597 punpcklbw m0, m1, m6 4598 punpckhbw m1, m6 4599 pmaddubsw m0, m2 4600 pmaddubsw m1, m3 4601 pmulhrsw m0, m5 4602 pmulhrsw m1, m5 4603 packuswb m0, m1 4604 mova [dstq+dsq*0], xm0 4605 vextracti128 [dstq+dsq*1], m0, 1 4606 lea dstq, [dstq+dsq*2] 4607 sub hd, 2 4608 jg .w16 4609 RET 4610ALIGN function_align 4611.w32: 4612 mova m0, [maskq] 4613 mova m1, [dstq] 4614 mova m6, [tmpq] 4615 add maskq, 32 4616 add tmpq, 32 4617 psubb m3, m4, m0 4618 punpcklbw m2, m3, m0 4619 punpckhbw m3, m0 4620 punpcklbw m0, m1, m6 4621 punpckhbw m1, m6 4622 pmaddubsw m0, m2 4623 pmaddubsw m1, m3 4624 pmulhrsw m0, m5 4625 pmulhrsw m1, m5 4626 packuswb m0, m1 4627 mova [dstq], m0 4628 add dstq, dsq 4629 dec hd 4630 jg .w32 4631 RET 4632 4633cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask 4634%define base r5-blend_v_avx2_table 4635 lea r5, [blend_v_avx2_table] 4636 tzcnt wd, wm 4637 movifnidn hd, hm 4638 movsxd wq, dword [r5+wq*4] 4639 vpbroadcastd m5, [base+pw_512] 4640 add wq, r5 4641 add maskq, obmc_masks-blend_v_avx2_table 4642 jmp wq 4643.w2: 4644 vpbroadcastd xm2, [maskq+2*2] 4645.w2_s0_loop: 4646 movd xm0, [dstq+dsq*0] 4647 pinsrw xm0, [dstq+dsq*1], 1 4648 movd xm1, [tmpq] 4649 add tmpq, 2*2 4650 punpcklbw xm0, xm1 4651 pmaddubsw xm0, xm2 4652 pmulhrsw xm0, xm5 4653 packuswb xm0, xm0 4654 pextrw [dstq+dsq*0], xm0, 0 4655 pextrw [dstq+dsq*1], xm0, 1 4656 lea dstq, [dstq+dsq*2] 4657 sub hd, 2 4658 jg .w2_s0_loop 4659 RET 4660ALIGN function_align 4661.w4: 4662 vpbroadcastq xm2, [maskq+4*2] 4663.w4_loop: 4664 movd xm0, [dstq+dsq*0] 4665 pinsrd xm0, [dstq+dsq*1], 1 4666 movq xm1, [tmpq] 4667 add tmpq, 4*2 4668 punpcklbw xm0, xm1 4669 pmaddubsw xm0, xm2 4670 pmulhrsw xm0, xm5 4671 packuswb xm0, xm0 4672 movd [dstq+dsq*0], xm0 4673 pextrd [dstq+dsq*1], xm0, 1 4674 lea dstq, [dstq+dsq*2] 4675 sub hd, 2 4676 jg .w4_loop 4677 RET 4678ALIGN function_align 4679.w8: 4680 vbroadcasti128 m4, [maskq+8*2] 4681.w8_loop: 4682 vpbroadcastq m2, [dstq+dsq*0] 4683 movq xm0, [dstq+dsq*1] 4684 vpblendd m0, m2, 0x30 4685 movq xm1, [tmpq+8*1] 4686 vinserti128 m1, [tmpq+8*0], 1 4687 add tmpq, 8*2 4688 punpcklbw m0, m1 4689 pmaddubsw m0, m4 4690 pmulhrsw m0, m5 4691 vextracti128 xm1, m0, 1 4692 packuswb xm0, xm1 4693 movhps [dstq+dsq*0], xm0 4694 movq [dstq+dsq*1], xm0 4695 lea dstq, [dstq+dsq*2] 4696 sub hd, 2 4697 jg .w8_loop 4698 RET 4699ALIGN function_align 4700.w16: 4701 vbroadcasti128 m3, [maskq+16*2] 4702 vbroadcasti128 m4, [maskq+16*3] 4703.w16_loop: 4704 mova xm1, [dstq+dsq*0] 4705 vinserti128 m1, [dstq+dsq*1], 1 4706 mova m2, [tmpq] 4707 add tmpq, 16*2 4708 punpcklbw m0, m1, m2 4709 punpckhbw m1, m2 4710 pmaddubsw m0, m3 4711 
cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea                  r5, [blend_h_avx2_table]
    mov                  r6d, wd
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    lea                  maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    shr                  hd, 2 ; h * 3/4
    lea                  maskq, [maskq+hq*2]
    neg                  hq
    jmp                  wq
.w2:
    movd                 xm0, [dstq+dsq*0]
    pinsrw               xm0, [dstq+dsq*1], 1
    movd                 xm2, [maskq+hq*2]
    movd                 xm1, [tmpq]
    add                  tmpq, 2*2
    punpcklwd            xm2, xm2
    punpcklbw            xm0, xm1
    pmaddubsw            xm0, xm2
    pmulhrsw             xm0, xm5
    packuswb             xm0, xm0
    pextrw               [dstq+dsq*0], xm0, 0
    pextrw               [dstq+dsq*1], xm0, 1
    lea                  dstq, [dstq+dsq*2]
    add                  hq, 2
    jl                   .w2
    RET
ALIGN function_align
.w4:
    mova                 xm3, [blend_shuf]
.w4_loop:
    movd                 xm0, [dstq+dsq*0]
    pinsrd               xm0, [dstq+dsq*1], 1
    movd                 xm2, [maskq+hq*2]
    movq                 xm1, [tmpq]
    add                  tmpq, 4*2
    pshufb               xm2, xm3
    punpcklbw            xm0, xm1
    pmaddubsw            xm0, xm2
    pmulhrsw             xm0, xm5
    packuswb             xm0, xm0
    movd                 [dstq+dsq*0], xm0
    pextrd               [dstq+dsq*1], xm0, 1
    lea                  dstq, [dstq+dsq*2]
    add                  hq, 2
    jl                   .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x03
.w8_loop:
    vpbroadcastq         m1, [dstq+dsq*0]
    movq                 xm0, [dstq+dsq*1]
    vpblendd             m0, m1, 0x30
    vpbroadcastd         m3, [maskq+hq*2]
    movq                 xm1, [tmpq+8*1]
    vinserti128          m1, [tmpq+8*0], 1
    add                  tmpq, 8*2
    pshufb               m3, m4
    punpcklbw            m0, m1
    pmaddubsw            m0, m3
    pmulhrsw             m0, m5
    vextracti128         xm1, m0, 1
    packuswb             xm0, xm1
    movhps               [dstq+dsq*0], xm0
    movq                 [dstq+dsq*1], xm0
    lea                  dstq, [dstq+dsq*2]
    add                  hq, 2
    jl                   .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x0c
.w16_loop:
    mova                 xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    vpbroadcastd         m3, [maskq+hq*2]
    mova                 m2, [tmpq]
    add                  tmpq, 16*2
    pshufb               m3, m4
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova                 [dstq+dsq*0], xm0
    vextracti128         [dstq+dsq*1], m0, 1
    lea                  dstq, [dstq+dsq*2]
    add                  hq, 2
    jl                   .w16_loop
    RET
ALIGN function_align
.w32: ; w32/w64/w128
    sub                  dsq, r6
.w32_loop0:
    vpbroadcastw         m3, [maskq+hq*2]
    mov                  wd, r6d
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                  tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova                 [dstq], m0
    add                  dstq, 32
    sub                  wd, 32
    jg                   .w32_loop
    add                  dstq, dsq
    inc                  hq
    jl                   .w32_loop0
    RET

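; blend_h note (illustrative only): same per-pixel formula as blend_v, but
; with one weight pair per row taken from the obmc_masks[] entry for block
; height h; only the top h * 3/4 rows are processed (see the "h * 3/4"
; setup above), the remaining rows are left untouched.
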
cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                             bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor                  r12d, r12d
    lea                  r10, [ihq-1]
    cmp                  yq, ihq
    cmovs                r10, yq
    test                 yq, yq
    cmovs                r10, r12
    imul                 r10, sstrideq
    add                  srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea                  r10, [iwq-1]
    cmp                  xq, iwq
    cmovs                r10, xq
    test                 xq, xq
    cmovs                r10, r12
    add                  srcq, r10

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea                  bottomextq, [yq+bhq]
    sub                  bottomextq, ihq
    lea                  r3, [bhq-1]
    cmovs                bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg                  topextq
    cmovs                topextq, r12
    cmp                  bottomextq, bhq
    cmovns               bottomextq, r3
    cmp                  topextq, bhq
    cmovg                topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea                  rightextq, [xq+bwq]
    sub                  rightextq, iwq
    lea                  r2, [bwq-1]
    cmovs                rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg                  leftextq
    cmovs                leftextq, r12
    cmp                  rightextq, bwq
    cmovns               rightextq, r2
    cmp                  leftextq, bwq
    cmovns               leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea                  r3, [bottomextq+topextq]
    sub                  centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov                  r2, topextq
    imul                 r2, dstrideq
    add                  dstq, r2
    mov                  r9m, dstq

    ; center_w = bw - left_ext - right_ext
    mov                  centerwq, bwq
    lea                  r3, [rightextq+leftextq]
    sub                  centerwq, r3

%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    xor                  r3, r3
    vpbroadcastb         m0, [srcq]
.left_loop_%3:
    mova                 [dstq+r3], m0
    add                  r3, 32
    cmp                  r3, leftextq
    jl                   .left_loop_%3

    ; body
    lea                  r12, [dstq+leftextq]
%endif
    xor                  r3, r3
.body_loop_%3:
    movu                 m0, [srcq+r3]
%if %1
    movu                 [r12+r3], m0
%else
    movu                 [dstq+r3], m0
%endif
    add                  r3, 32
    cmp                  r3, centerwq
    jl                   .body_loop_%3

%if %2
    ; right extension
%if %1
    add                  r12, centerwq
%else
    lea                  r12, [dstq+centerwq]
%endif
    xor                  r3, r3
    vpbroadcastb         m0, [srcq+centerwq-1]
.right_loop_%3:
    movu                 [r12+r3], m0
    add                  r3, 32
    cmp                  r3, rightextq
    jl                   .right_loop_%3

%endif
    add                  dstq, dstrideq
    add                  srcq, sstrideq
    dec                  centerhq
    jg                   .v_loop_%3
%endmacro

    test                 leftextq, leftextq
    jnz                  .need_left_ext
    test                 rightextq, rightextq
    jnz                  .need_right_ext
    v_loop               0, 0, 0
    jmp                  .body_done

.need_left_ext:
    test                 rightextq, rightextq
    jnz                  .need_left_right_ext
    v_loop               1, 0, 1
    jmp                  .body_done

.need_left_right_ext:
    v_loop               1, 1, 2
    jmp                  .body_done

.need_right_ext:
    v_loop               0, 1, 3

.body_done:
    ; bottom edge extension
    test                 bottomextq, bottomextq
    jz                   .top
    mov                  srcq, dstq
    sub                  srcq, dstrideq
    xor                  r1, r1
.bottom_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, bottomextq
.bottom_y_loop:
    mova                 [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg                   .bottom_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl                   .bottom_x_loop

.top:
    ; top edge extension
    test                 topextq, topextq
    jz                   .end
    mov                  srcq, r9m
    mov                  dstq, dstm
    xor                  r1, r1
.top_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, topextq
.top_y_loop:
    mova                 [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg                   .top_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl                   .top_x_loop

.end:
    RET

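; Worked example for the clipping above (illustrative only): bw = 8, iw = 16,
; x = -3 gives left_ext = iclip(-x, 0, bw - 1) = 3,
; right_ext = iclip(x + bw - iw, 0, bw - 1) = 0 and center_w = 8 - 3 = 5;
; the source pointer is clamped to column 0, its leftmost pixel is replicated
; into the 3 left-extension columns and the remaining 5 pixels are copied.
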
cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
                           dst_w, h, src_w, dx, mx0
    sub                  dword mx0m, 4<<14
    sub                  dword src_wm, 8
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm

    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
    LEA                  r7, $$
%define base r7-$$

    vpbroadcastd         m3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti128       m15, [base+pb_8x0_8x8]
    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld                m5, 3 ; dx*8
    pslld                m6, 14
    paddd                m8, m2 ; mx+[0..7]*dx
    pxor                 m2, m2

    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8

.loop_y:
    xor                  xd, xd
    mova                 m4, m8 ; per-line working version of mx

.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8 ; filter offset (unmasked)
    pminsd               m0, m6 ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0 ; pshufb offset
    psrad                m0, 14 ; clipped src_x offset
    psrad                m1, 14 ; pshufb edge_emu offset
    pand                 m9, m7 ; filter offset (masked)

    ; load source pixels - this ugly code is vpgatherdq emulation since
    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    movd                 r8d, xm0
    pextrd               r9d, xm0, 1
    pextrd               r10d, xm0, 2
    pextrd               r11d, xm0, 3
    vextracti128         xm0, m0, 1
    movq                 xm12, [srcq+r8]
    movq                 xm13, [srcq+r10]
    movhps               xm12, [srcq+r9]
    movhps               xm13, [srcq+r11]
    movd                 r8d, xm0
    pextrd               r9d, xm0, 1
    pextrd               r10d, xm0, 2
    pextrd               r11d, xm0, 3
    vinserti128          m12, [srcq+r8], 1
    vinserti128          m13, [srcq+r10], 1
    vpbroadcastq         m10, [srcq+r9]
    vpbroadcastq         m11, [srcq+r11]
    vpblendd             m12, m10, 11000000b
    vpblendd             m13, m11, 11000000b

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    vptest               m1, m1
    jz                   .filter

    movd                 r8d, xm1
    pextrd               r9d, xm1, 1
    pextrd               r10d, xm1, 2
    pextrd               r11d, xm1, 3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd               r10, r10d
    movsxd               r11, r11d
    vextracti128         xm1, m1, 1
    movq                 xm14, [base+resize_shuf+4+r8]
    movq                 xm0, [base+resize_shuf+4+r10]
    movhps               xm14, [base+resize_shuf+4+r9]
    movhps               xm0, [base+resize_shuf+4+r11]
    movd                 r8d, xm1
    pextrd               r9d, xm1, 1
    pextrd               r10d, xm1, 2
    pextrd               r11d, xm1, 3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd               r10, r10d
    movsxd               r11, r11d
    vinserti128          m14, [base+resize_shuf+4+r8], 1
    vinserti128          m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq         m10, [base+resize_shuf+4+r9]
    vpbroadcastq         m11, [base+resize_shuf+4+r11]
    vpblendd             m14, m10, 11000000b
    vpblendd             m0, m11, 11000000b

    paddb                m14, m15
    paddb                m0, m15
    pshufb               m12, m14
    pshufb               m13, m0

.filter:
    movd                 r8d, xm9
    pextrd               r9d, xm9, 1
    pextrd               r10d, xm9, 2
    pextrd               r11d, xm9, 3
    vextracti128         xm9, m9, 1
    movq                 xm10, [base+resize_filter+r8*8]
    movq                 xm11, [base+resize_filter+r10*8]
    movhps               xm10, [base+resize_filter+r9*8]
    movhps               xm11, [base+resize_filter+r11*8]
    movd                 r8d, xm9
    pextrd               r9d, xm9, 1
    pextrd               r10d, xm9, 2
    pextrd               r11d, xm9, 3
    vinserti128          m10, [base+resize_filter+r8*8], 1
    vinserti128          m11, [base+resize_filter+r10*8], 1
    vpbroadcastq         m14, [base+resize_filter+r9*8]
    vpbroadcastq         m1, [base+resize_filter+r11*8]
    vpblendd             m10, m14, 11000000b
    vpblendd             m11, m1, 11000000b

    pmaddubsw            m12, m10
    pmaddubsw            m13, m11
    phaddw               m12, m13
    vextracti128         xm13, m12, 1
    phaddsw              xm12, xm13
    pmulhrsw             xm12, xm3 ; x=(x+64)>>7
    packuswb             xm12, xm12
    movq                 [dstq+xq], xm12

    paddd                m4, m5
    add                  xd, 8
    cmp                  xd, dst_wd
    jl                   .loop_x

    add                  dstq, dst_strideq
    add                  srcq, src_strideq
    dec                  hd
    jg                   .loop_y
    RET

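; resize reference (illustrative only): for each output pixel x the 14-bit
; fixed-point source position is mx = mx0 + x*dx; the 8-tap filter phase is
; (mx >> 8) & 63, the source column is mx >> 14 clamped to [0, src_w - 8],
; and the result is scaled as (sum + 64) >> 7 as noted above. The
; resize_shuf/pb_8x0_8x8 path only kicks in when taps fall outside the row.
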
cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea                  r7, [w_mask_420_avx2_table]
    tzcnt                wd, wm
    mov                  r6d, r7m ; sign
    movifnidn            hd, hm
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd             m9, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_420_sign+r6*4] ; 258 - sign
    add                  wq, r7
    W_MASK               0, 4, 0, 1
    mov                  maskq, maskmp
    lea                  stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128         xm1, m0, 1
    movd                 [dstq+strideq*0], xm0
    pextrd               [dstq+strideq*1], xm0, 1
    movd                 [dstq+strideq*2], xm1
    pextrd               [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl                   .w4_end
    lea                  dstq, [dstq+strideq*4]
    pextrd               [dstq+strideq*0], xm0, 2
    pextrd               [dstq+strideq*1], xm0, 3
    pextrd               [dstq+strideq*2], xm1, 2
    pextrd               [dstq+stride3q ], xm1, 3
    jg                   .w4_h16
.w4_end:
    vextracti128         xm0, m4, 1
    vpblendd             xm1, xm4, xm0, 0x05
    vpblendd             xm4, xm0, 0x0a
    pshufd               xm1, xm1, q2301
    psubw                xm4, xm8, xm4
    psubw                xm4, xm1
    psrlw                xm4, 2
    packuswb             xm4, xm4
    movq                 [maskq], xm4
    RET
.w4_h16:
    W_MASK               0, 5, 2, 3
    lea                  dstq, [dstq+strideq*4]
    phaddd               m4, m5
    vextracti128         xm1, m0, 1
    psubw                m4, m8, m4
    psrlw                m4, 2
    vpermd               m4, m9, m4
    vextracti128         xm5, m4, 1
    packuswb             xm4, xm5
    movd                 [dstq+strideq*0], xm0
    pextrd               [dstq+strideq*1], xm0, 1
    movd                 [dstq+strideq*2], xm1
    pextrd               [dstq+stride3q ], xm1, 1
    lea                  dstq, [dstq+strideq*4]
    pextrd               [dstq+strideq*0], xm0, 2
    pextrd               [dstq+strideq*1], xm0, 3
    pextrd               [dstq+strideq*2], xm1, 2
    pextrd               [dstq+stride3q ], xm1, 3
    mova                 [maskq], xm4
    RET
.w8_loop:
    add                  tmp1q, 2*32
    add                  tmp2q, 2*32
    W_MASK               0, 4, 0, 1
    lea                  dstq, [dstq+strideq*4]
    add                  maskq, 8
.w8:
    vextracti128         xm2, m4, 1
    vextracti128         xm1, m0, 1
    psubw                xm4, xm8, xm4
    psubw                xm4, xm2
    psrlw                xm4, 2
    packuswb             xm4, xm4
    movq                 [dstq+strideq*0], xm0
    movq                 [dstq+strideq*1], xm1
    movhps               [dstq+strideq*2], xm0
    movhps               [dstq+stride3q ], xm1
    movq                 [maskq], xm4
    sub                  hd, 4
    jg                   .w8_loop
    RET
.w16_loop:
    add                  tmp1q, 4*32
    add                  tmp2q, 4*32
    W_MASK               0, 4, 0, 1
    lea                  dstq, [dstq+strideq*4]
    add                  maskq, 16
.w16:
    vpermq               m0, m0, q3120
    mova                 [dstq+strideq*0], xm0
    vextracti128         [dstq+strideq*1], m0, 1
    W_MASK               0, 5, 2, 3
    punpckhqdq           m1, m4, m5
    punpcklqdq           m4, m5
    psubw                m1, m8, m1
    psubw                m1, m4
    psrlw                m1, 2
    vpermq               m0, m0, q3120
    packuswb             m1, m1
    vpermd               m1, m9, m1
    mova                 [dstq+strideq*2], xm0
    vextracti128         [dstq+stride3q ], m0, 1
    mova                 [maskq], xm1
    sub                  hd, 4
    jg                   .w16_loop
    RET
.w32_loop:
    add                  tmp1q, 4*32
    add                  tmp2q, 4*32
    W_MASK               0, 4, 0, 1
    lea                  dstq, [dstq+strideq*2]
    add                  maskq, 16
.w32:
    vpermq               m0, m0, q3120
    mova                 [dstq+strideq*0], m0
    W_MASK               0, 5, 2, 3
    psubw                m4, m8, m4
    psubw                m4, m5
    psrlw                m4, 2
    vpermq               m0, m0, q3120
    packuswb             m4, m4
    vpermd               m4, m9, m4
    mova                 [dstq+strideq*1], m0
    mova                 [maskq], xm4
    sub                  hd, 2
    jg                   .w32_loop
    RET
.w64_loop_even:
    psubw                m10, m8, m4
    psubw                m11, m8, m5
    dec                  hd
.w64_loop:
    add                  tmp1q, 4*32
    add                  tmp2q, 4*32
    W_MASK               0, 4, 0, 1
    add                  dstq, strideq
.w64:
    vpermq               m0, m0, q3120
    mova                 [dstq+32*0], m0
    W_MASK               0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova                 [dstq+32*1], m0
    test                 hd, 1
    jz                   .w64_loop_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova                 [maskq], m4
    add                  maskq, 32
    dec                  hd
    jg                   .w64_loop
    RET
.w128_loop_even:
    psubw                m12, m8, m4
    psubw                m13, m8, m5
    dec                  hd
.w128_loop:
    W_MASK               0, 4, 0, 1
    add                  dstq, strideq
.w128:
    vpermq               m0, m0, q3120
    mova                 [dstq+32*0], m0
    W_MASK               0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova                 [dstq+32*1], m0
    add                  tmp1q, 8*32
    add                  tmp2q, 8*32
    test                 hd, 1
    jz                   .w128_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova                 [maskq+32*0], m4
    jmp                  .w128_odd
.w128_even:
    psubw                m10, m8, m4
    psubw                m11, m8, m5
.w128_odd:
    W_MASK               0, 4, -4, -3
    vpermq               m0, m0, q3120
    mova                 [dstq+32*2], m0
    W_MASK               0, 5, -2, -1
    vpermq               m0, m0, q3120
    mova                 [dstq+32*3], m0
    test                 hd, 1
    jz                   .w128_loop_even
    psubw                m4, m12, m4
    psubw                m5, m13, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova                 [maskq+32*1], m4
    add                  maskq, 64
    dec                  hd
    jg                   .w128_loop
    RET

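; w_mask_420 mask reference (illustrative only): the four 4:4:4 mask values
; of each 2x2 chroma position are combined as
;   mask[x] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2
; which is what the (258 - sign) constant from wm_420_sign and the two
; psubw/psrlw steps above compute from the "64 - m" intermediates.
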
cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
    lea                  r7, [w_mask_422_avx2_table]
    tzcnt                wd, wm
    mov                  r6d, r7m ; sign
    movifnidn            hd, hm
    pxor                 m9, m9
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd             m10, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
    add                  wq, r7
    mov                  maskq, maskmp
    W_MASK               0, 4, 0, 1
    lea                  stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128         xm1, m0, 1
    movd                 [dstq+strideq*0], xm0
    pextrd               [dstq+strideq*1], xm0, 1
    movd                 [dstq+strideq*2], xm1
    pextrd               [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl                   .w4_end
    lea                  dstq, [dstq+strideq*4]
    pextrd               [dstq+strideq*0], xm0, 2
    pextrd               [dstq+strideq*1], xm0, 3
    pextrd               [dstq+strideq*2], xm1, 2
    pextrd               [dstq+stride3q ], xm1, 3
    jg                   .w4_h16
.w4_end:
    vextracti128         xm5, m4, 1
    packuswb             xm4, xm5
    psubb                xm5, xm8, xm4
    pavgb                xm5, xm9
    pshufd               xm5, xm5, q3120
    mova                 [maskq], xm5
    RET
.w4_h16:
    W_MASK               0, 5, 2, 3
    lea                  dstq, [dstq+strideq*4]
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermd               m5, m10, m5
    vextracti128         xm1, m0, 1
    movd                 [dstq+strideq*0], xm0
    pextrd               [dstq+strideq*1], xm0, 1
    movd                 [dstq+strideq*2], xm1
    pextrd               [dstq+stride3q ], xm1, 1
    lea                  dstq, [dstq+strideq*4]
    pextrd               [dstq+strideq*0], xm0, 2
    pextrd               [dstq+strideq*1], xm0, 3
    pextrd               [dstq+strideq*2], xm1, 2
    pextrd               [dstq+stride3q ], xm1, 3
    mova                 [maskq], m5
    RET
.w8_loop:
    add                  tmp1q, 32*2
    add                  tmp2q, 32*2
    W_MASK               0, 4, 0, 1
    lea                  dstq, [dstq+strideq*4]
    add                  maskq, 16
.w8:
    vextracti128         xm5, m4, 1
    vextracti128         xm1, m0, 1
    packuswb             xm4, xm5
    psubb                xm5, xm8, xm4
    pavgb                xm5, xm9
    pshufd               xm5, xm5, q3120
    movq                 [dstq+strideq*0], xm0
    movq                 [dstq+strideq*1], xm1
    movhps               [dstq+strideq*2], xm0
    movhps               [dstq+stride3q ], xm1
    mova                 [maskq], xm5
    sub                  hd, 4
    jg                   .w8_loop
    RET
.w16_loop:
    add                  tmp1q, 32*4
    add                  tmp2q, 32*4
    W_MASK               0, 4, 0, 1
    lea                  dstq, [dstq+strideq*4]
    add                  maskq, 32
.w16:
    vpermq               m0, m0, q3120
    mova                 [dstq+strideq*0], xm0
    vextracti128         [dstq+strideq*1], m0, 1
    W_MASK               0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova                 [dstq+strideq*2], xm0
    vextracti128         [dstq+stride3q ], m0, 1
    mova                 [maskq], m5
    sub                  hd, 4
    jg                   .w16_loop
    RET
.w32_loop:
    add                  tmp1q, 32*4
    add                  tmp2q, 32*4
    W_MASK               0, 4, 0, 1
    lea                  dstq, [dstq+strideq*2]
    add                  maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova                 [dstq+strideq*0], m0
    W_MASK               0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova                 [dstq+strideq*1], m0
    mova                 [maskq], m5
    sub                  hd, 2
    jg                   .w32_loop
    RET
.w64_loop:
    add                  tmp1q, 32*4
    add                  tmp2q, 32*4
    W_MASK               0, 4, 0, 1
    add                  dstq, strideq
    add                  maskq, 32
.w64:
    vpermq               m0, m0, q3120
    mova                 [dstq+32*0], m0
    W_MASK               0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova                 [dstq+32*1], m0
    mova                 [maskq], m5
    dec                  hd
    jg                   .w64_loop
    RET
.w128_loop:
    add                  tmp1q, 32*8
    add                  tmp2q, 32*8
    W_MASK               0, 4, 0, 1
    add                  dstq, strideq
    add                  maskq, 32*2
.w128:
    vpermq               m0, m0, q3120
    mova                 [dstq+32*0], m0
    W_MASK               0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova                 [dstq+32*1], m0
    mova                 [maskq+32*0], m5
    W_MASK               0, 4, 4, 5
    vpermq               m0, m0, q3120
    mova                 [dstq+32*2], m0
    W_MASK               0, 5, 6, 7
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova                 [dstq+32*3], m0
    mova                 [maskq+32*1], m5
    dec                  hd
    jg                   .w128_loop
    RET

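; w_mask_422 mask reference (illustrative only): horizontal pairs are
; combined as mask[x] = (m0 + m1 + 1 - sign) >> 1, realized with the
; (128 - sign) constant from wm_422_sign, psubb and pavgb against zero.
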
cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
    lea                  r7, [w_mask_444_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov                  maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m5, [base+pb_64]
    vpbroadcastd         m7, [base+pw_2048]
    add                  wq, r7
    W_MASK               0, 4, 0, 1, 1
    lea                  stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128         xm1, m0, 1
    movd                 [dstq+strideq*0], xm0
    pextrd               [dstq+strideq*1], xm0, 1
    movd                 [dstq+strideq*2], xm1
    pextrd               [dstq+stride3q ], xm1, 1
    mova                 [maskq+32*0], m4
    cmp                  hd, 8
    jl                   .w4_end
    lea                  dstq, [dstq+strideq*4]
    pextrd               [dstq+strideq*0], xm0, 2
    pextrd               [dstq+strideq*1], xm0, 3
    pextrd               [dstq+strideq*2], xm1, 2
    pextrd               [dstq+stride3q ], xm1, 3
    je                   .w4_end
    W_MASK               0, 4, 2, 3, 1
    lea                  dstq, [dstq+strideq*4]
    vextracti128         xm1, m0, 1
    movd                 [dstq+strideq*0], xm0
    pextrd               [dstq+strideq*1], xm0, 1
    movd                 [dstq+strideq*2], xm1
    pextrd               [dstq+stride3q ], xm1, 1
    lea                  dstq, [dstq+strideq*4]
    pextrd               [dstq+strideq*0], xm0, 2
    pextrd               [dstq+strideq*1], xm0, 3
    pextrd               [dstq+strideq*2], xm1, 2
    pextrd               [dstq+stride3q ], xm1, 3
    mova                 [maskq+32*1], m4
.w4_end:
    RET
.w8_loop:
    add                  tmp1q, 32*2
    add                  tmp2q, 32*2
    W_MASK               0, 4, 0, 1, 1
    lea                  dstq, [dstq+strideq*4]
    add                  maskq, 32
.w8:
    vextracti128         xm1, m0, 1
    movq                 [dstq+strideq*0], xm0
    movq                 [dstq+strideq*1], xm1
    movhps               [dstq+strideq*2], xm0
    movhps               [dstq+stride3q ], xm1
    mova                 [maskq], m4
    sub                  hd, 4
    jg                   .w8_loop
    RET
.w16_loop:
    add                  tmp1q, 32*2
    add                  tmp2q, 32*2
    W_MASK               0, 4, 0, 1, 1
    lea                  dstq, [dstq+strideq*2]
    add                  maskq, 32
.w16:
    vpermq               m0, m0, q3120
    mova                 [dstq+strideq*0], xm0
    vextracti128         [dstq+strideq*1], m0, 1
    mova                 [maskq], m4
    sub                  hd, 2
    jg                   .w16_loop
    RET
.w32_loop:
    add                  tmp1q, 32*2
    add                  tmp2q, 32*2
    W_MASK               0, 4, 0, 1, 1
    add                  dstq, strideq
    add                  maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova                 [dstq], m0
    mova                 [maskq], m4
    dec                  hd
    jg                   .w32_loop
    RET
.w64_loop:
    add                  tmp1q, 32*4
    add                  tmp2q, 32*4
    W_MASK               0, 4, 0, 1, 1
    add                  dstq, strideq
    add                  maskq, 32*2
.w64:
    vpermq               m0, m0, q3120
    mova                 [dstq+32*0], m0
    mova                 [maskq+32*0], m4
    W_MASK               0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova                 [dstq+32*1], m0
    mova                 [maskq+32*1], m4
    dec                  hd
    jg                   .w64_loop
    RET
.w128_loop:
    add                  tmp1q, 32*8
    add                  tmp2q, 32*8
    W_MASK               0, 4, 0, 1, 1
    add                  dstq, strideq
    add                  maskq, 32*4
.w128:
    vpermq               m0, m0, q3120
    mova                 [dstq+32*0], m0
    mova                 [maskq+32*0], m4
    W_MASK               0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova                 [dstq+32*1], m0
    mova                 [maskq+32*1], m4
    W_MASK               0, 4, 4, 5, 1
    vpermq               m0, m0, q3120
    mova                 [dstq+32*2], m0
    mova                 [maskq+32*2], m4
    W_MASK               0, 4, 6, 7, 1
    vpermq               m0, m0, q3120
    mova                 [dstq+32*3], m0
    mova                 [maskq+32*3], m4
    dec                  hd
    jg                   .w128_loop
    RET

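; w_mask_444 note (illustrative only): the mask is stored at full resolution;
; the W_MASK 4:4:4 path already converts "64 - m" back to m via pb_64, so m4
; is written out unchanged and no sign handling is needed here.
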
%endif ; ARCH_X86_64