; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table
  %rotate 1
 %endrep
%endmacro

%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro

SECTION_RODATA 32

pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
               dd 0x80, 0x00, 0x00
blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
               dd 0x00, 0x00
blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000
blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000, 0x0000
blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x: db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_128:        times 2 dw 128
pw_2048:       times 2 dw 2048
tap_table:     ; masks for 8 bit shifts
               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
               ; weights
               db  4,  2,  3,  3,  2,  1
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
               db  1 * 16 + 0,  2 * 16 + 0
               db  1 * 16 + 0,  2 * 16 - 1
               ; the last 6 are repeats of the first 6 so we don't need to & 7
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8

SECTION .text
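
; The direction handlers below (.d<dir>k<tap>) are reached through the
; per-block jump tables built by CDEF_FILTER_JMP_TABLE; note that the entry
; list is padded with wrapped-around duplicates at both ends so that the
; dir-2 and dir+2 lookups need no modulo.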

%macro PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov          dird, r6m
    lea        tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea          dirq, [tableq+dirq*2*4]
%if %1 == 4
 %if %2 == 4
    DEFINE_ARGS dst, stride, left, top, pri, sec, \
                table, dir, dirjmp, dst4, stride3, k
 %else
    DEFINE_ARGS dst, stride, left, top, pri, sec, \
                table, dir, dirjmp, dst4, dst8, stride3, k
    lea         dst8q, [dstq+strideq*8]
 %endif
%else
    DEFINE_ARGS dst, stride, h, top1, pri, sec, \
                table, dir, dirjmp, top2, dst4, stride3, k
    mov            hq, -8
    lea         top1q, [top1q+strideq*0]
    lea         top2q, [top1q+strideq*1]
%endif
    lea         dst4q, [dstq+strideq*4]
%if %1 == 4
    lea      stride3q, [strideq*3]
%endif
%endmacro

%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov            kd, 1
    pxor          m15, m15 ; sum
%if %2 == 8
    pxor          m12, m12
 %if %1 == 4
    movd          xm4, [dstq +strideq*0]
    movd          xm6, [dstq +strideq*1]
    movd          xm5, [dstq +strideq*2]
    movd          xm7, [dstq +stride3q ]
    vinserti128    m4, [dst4q+strideq*0], 1
    vinserti128    m6, [dst4q+strideq*1], 1
    vinserti128    m5, [dst4q+strideq*2], 1
    vinserti128    m7, [dst4q+stride3q ], 1
    punpckldq      m4, m6
    punpckldq      m5, m7
 %else
    movq          xm4, [dstq+strideq*0]
    movq          xm5, [dstq+strideq*1]
    vinserti128    m4, [dstq+strideq*2], 1
    vinserti128    m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq     m4, m5
%else
    movd          xm4, [dstq+strideq*0]
    movd          xm5, [dstq+strideq*1]
    vinserti128    m4, [dstq+strideq*2], 1
    vinserti128    m5, [dstq+stride3q ], 1
    punpckldq      m4, m5
%endif
%if %3 == 1
    mova           m7, m4 ; max
    mova           m8, m4 ; min
%endif
%endmacro

%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd    dirjmpq, [dirq+kq*4+%1*2*4]
    add       dirjmpq, tableq
    call      dirjmpq

%if %8 == 1
    pmaxub         m7, m5
    pminub         m8, m5
    pmaxub         m7, m6
    pminub         m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    punpcklbw      m5, m6
    punpcklbw      m6, m4, m4
    psubusb        m9, m5, m6
    psubusb        m5, m6, m5
    por            m9, m5      ; abs_diff_p01(p01 - px)
    pcmpeqb        m5, m9
    por            m5, %5
    psignb         m6, %5, m5
    psrlw          m5, m9, %2  ; emulate 8-bit shift
    pand           m5, %3
    psubusb        m5, %4, m5
    pminub         m5, m9
    pmaddubsw      m5, m6
    paddw         m15, m5
%else
    psubusb        m9, m5, m4
    psubusb        m5, m4, m5
    psubusb       m11, m6, m4
    psubusb        m6, m4, m6
    por            m9, m5      ; abs_diff_p0(p0 - px)
    por           m11, m6      ; abs_diff_p1(p1 - px)
    pcmpeqb        m5, m9
    pcmpeqb        m6, m11
    punpckhbw     m10, m9, m11
    punpcklbw      m9, m11
    por            m5, %5
    por           m11, m6, %5
    punpckhbw      m6, m5, m11
    punpcklbw      m5, m11
    psignb        m11, %5, m6
    psrlw          m6, m10, %2 ; emulate 8-bit shift
    pand           m6, %3
    psubusb        m6, %4, m6
    pminub         m6, m10
    pmaddubsw      m6, m11
    paddw         m12, m6
    psignb        m11, %5, m5
    psrlw          m5, m9, %2  ; emulate 8-bit shift
    pand           m5, %3
    psubusb        m5, %4, m5
    pminub         m5, m9
    pmaddubsw      m5, m11
    paddw         m15, m5
%endif
%endmacro
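
; note on the rounding in ADJUST_PIXEL below: pmulhrsw by 2048 computes
; (sum*2048*2 + 0x8000) >> 16 == (sum + 8) >> 4, and the pcmpgtw/paddw pair
; first adds -1 to negative sums, so px is adjusted by
; (8 + sum - (sum < 0)) >> 4, i.e. biased rounding toward zero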
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw      m4, %3
 %endif
    pcmpgtw        %3, m15
    paddw         m15, %3
    pmulhrsw      m15, %4
 %if %5 == 0
    packsswb      m15, m15
    paddb          m4, m15
 %else
    paddw          m4, m15
    packuswb       m4, m4 ; clip px in [0x0,0xff]
    pminub         m4, m7
    pmaxub         m4, m8
 %endif
    vextracti128  xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    pcmpgtw        m6, %3, m12
    pcmpgtw        m5, %3, m15
    paddw         m12, m6
    paddw         m15, m5
 %if %5 == 1
    punpckhbw      m5, m4, %3
    punpcklbw      m4, %3
 %endif
    pmulhrsw      m12, %4
    pmulhrsw      m15, %4
 %if %5 == 0
    packsswb      m15, m12
    paddb          m4, m15
 %else
    paddw          m5, m12
    paddw          m4, m15
    packuswb       m4, m5 ; clip px in [0x0,0xff]
    pminub         m4, m7
    pmaxub         m4, m8
 %endif
    vextracti128  xm5, m4, 1
 %if %1 == 4
    movd   [dstq +strideq*0], xm4
    movd   [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro

%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov          dird, r6m
    lea          dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
 %else
    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
 %endif
    mov            hd, %1*%2*2/mmsize
%else
    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
%endif
    lea          stkq, [px]
    pxor          m11, m11
%endmacro

%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov            kd, 1
%if %1 == 4
    movq          xm4, [stkq+32*0]
    movhps        xm4, [stkq+32*1]
    movq          xm5, [stkq+32*2]
    movhps        xm5, [stkq+32*3]
    vinserti128    m4, xm5, 1
%else
    mova          xm4, [stkq+32*0] ; px
    vinserti128    m4, [stkq+32*1], 1
%endif
    pxor          m15, m15 ; sum
%if %3 == 1
    mova           m7, m4 ; max
    mova           m8, m4 ; min
%endif
%endmacro

%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx        offq, byte [dirq+kq+%1] ; off1
%if %6 == 4
    movq          xm5, [stkq+offq*2+32*0] ; p0
    movq          xm6, [stkq+offq*2+32*2]
    movhps        xm5, [stkq+offq*2+32*1]
    movhps        xm6, [stkq+offq*2+32*3]
    vinserti128    m5, xm6, 1
%else
    movu          xm5, [stkq+offq*2+32*0] ; p0
    vinserti128    m5, [stkq+offq*2+32*1], 1
%endif
    neg          offq ; -off1
%if %6 == 4
    movq          xm6, [stkq+offq*2+32*0] ; p1
    movq          xm9, [stkq+offq*2+32*2]
    movhps        xm6, [stkq+offq*2+32*1]
    movhps        xm9, [stkq+offq*2+32*3]
    vinserti128    m6, xm9, 1
%else
    movu          xm6, [stkq+offq*2+32*0] ; p1
    vinserti128    m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
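    ; (the border setup stores 0x8000 for such pixels: 32768 as a u16,
    ; -32768 as an s16)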
    ; use signed max and unsigned min to remove them
    pmaxsw         m7, m5 ; max after p0
    pminuw         m8, m5 ; min after p0
    pmaxsw         m7, m6 ; max after p1
    pminuw         m8, m6 ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw          m5, m4 ; diff_p0(p0 - px)
    psubw          m6, m4 ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb       m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb         m5, m12
    pabsb          m9, m5
    psignb        m10, %5, m5
    psrlw          m5, m9, %2 ; emulate 8-bit shift
    pand           m5, %3
    psubusb        m5, %4, m5

    ; use unsigned min since abs diff can equal 0x80
    pminub         m5, m9
    pmaddubsw      m5, m10
    paddw         m15, m5
%endmacro

%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw        m9, m11, m15
    paddw         m15, m9
    pmulhrsw      m15, %2
    paddw          m4, m15
%if %3 == 1
    pminsw         m4, m7
    pmaxsw         m4, m8
%endif
    packuswb       m4, m4
    vextracti128  xm5, m4, 1
%if %1 == 4
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+stride3q ], xm5, 1
%else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*1], xm5
%endif
%endmacro

%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
                                pri, sec, dir, damping, edge
%assign stack_offset_entry stack_offset
    mov         edged, edgem
    cmp         edged, 0xf
    jne .border_block

    PUSH           r9
    PUSH          r10
    PUSH          r11
%if %2 == 4
 %assign regs_used 12
 %if STACK_ALIGNMENT < 32
    PUSH  r%+regs_used
  %assign regs_used regs_used+1
 %endif
    ALLOC_STACK 0x60, 16
    pmovzxbw      xm0, [leftq+1]
    vpermq         m0, m0, q0110
    psrldq         m1, m0, 4
    vpalignr       m2, m0, m0, 12
    movu  [rsp+0x10], m0
    movu  [rsp+0x28], m1
    movu  [rsp+0x40], m2
%elif %1 == 4
    PUSH          r12
 %assign regs_used 13
 %if STACK_ALIGNMENT < 32
    PUSH  r%+regs_used
  %assign regs_used regs_used+1
 %endif
    ALLOC_STACK 8*2+%1*%2*1, 16
    pmovzxwd       m0, [leftq]
    mova  [rsp+0x10], m0
%else
    PUSH          r12
    PUSH          r13
 %assign regs_used 14
 %if STACK_ALIGNMENT < 32
    PUSH  r%+regs_used
  %assign regs_used regs_used+1
 %endif
    ALLOC_STACK 8*2+%1*%2*2+32, 16
    lea           r11, [strideq*3]
    movu          xm4, [dstq+strideq*2]
    pmovzxwq       m0, [leftq+0]
    pmovzxwq       m1, [leftq+8]
    vinserti128    m4, [dstq+r11], 1
    pmovzxbd       m2, [leftq+1]
    pmovzxbd       m3, [leftq+9]
    mova  [rsp+0x10], m0
    mova  [rsp+0x30], m1
    mova  [rsp+0x50], m2
    mova  [rsp+0x70], m3
    mova  [rsp+0x90], m4
%endif

    DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
    mov      dampingd, r7m
    xor         zerod, zerod
    movifnidn    prid, prim
    sub      dampingd, 31
    movifnidn secdmpd, secdmpm
    test         prid, prid
    jz .sec_only
    movd          xm0, prid
    lzcnt     pridmpd, prid
    add       pridmpd, dampingd
    cmovs     pridmpd, zerod
    mov       [rsp+0], pridmpq ; pri_shift
    test      secdmpd, secdmpd
    jz .pri_only
    movd          xm1, secdmpd
    lzcnt     secdmpd, secdmpd
    add       secdmpd, dampingd
    cmovs     secdmpd, zerod
    mov       [rsp+8], secdmpq ; sec_shift

    DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
    lea        tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
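    ; the tap weights depend only on the low bit of pri_strength:
    ; tap_table+8 holds the {4,2} (even) and {3,3} (odd) pri_taps pairs,
    ; and tap_table+12 the constant {2,1} sec_taps pair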
    vpbroadcastb   m0, xm0 ; pri_strength
    vpbroadcastb   m1, xm1 ; sec_strength
    and          prid, 1
    lea          priq, [tableq+priq*2+8] ; pri_taps
    lea          secq, [tableq+12]       ; sec_taps

    PREP_REGS %1, %2
%if %1*%2 > mmsize
.v_loop:
%endif
    LOAD_BLOCK %1, %2, 1
.k_loop:
    vpbroadcastb   m2, [priq+kq] ; pri_taps
    vpbroadcastb   m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
    dec            kq
    jge .k_loop

    vpbroadcastd  m10, [pw_2048]
    pxor           m9, m9
    ADJUST_PIXEL %1, %2, m9, m10, 1
%if %1*%2 > mmsize
    mov          dstq, dst4q
    lea         top1q, [rsp+0x90]
    lea         top2q, [rsp+0xA0]
    lea         dst4q, [dst4q+strideq*4]
    add            hq, 4
    jl .v_loop
%endif
    RET

.pri_only:
    DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
    lea        tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
    vpbroadcastb   m0, xm0 ; pri_strength
    and          prid, 1
    lea          priq, [tableq+priq*2+8] ; pri_taps
    PREP_REGS %1, %2
    vpbroadcastd   m3, [pw_2048]
    pxor           m1, m1
%if %1*%2 > mmsize
.pri_v_loop:
%endif
    LOAD_BLOCK %1, %2
.pri_k_loop:
    vpbroadcastb   m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
    dec            kq
    jge .pri_k_loop
    ADJUST_PIXEL %1, %2, m1, m3
%if %1*%2 > mmsize
    mov          dstq, dst4q
    lea         top1q, [rsp+0x90]
    lea         top2q, [rsp+0xA0]
    lea         dst4q, [dst4q+strideq*4]
    add            hq, 4
    jl .pri_v_loop
%endif
    RET

.sec_only:
    DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
    movd          xm1, secdmpd
    lzcnt     secdmpd, secdmpd
    add       secdmpd, dampingd
    cmovs     secdmpd, zerod
    mov       [rsp+8], secdmpq ; sec_shift
    DEFINE_ARGS dst, stride, left, top, _, secdmp, table
    lea        tableq, [tap_table]
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask
    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
    vpbroadcastb   m1, xm1 ; sec_strength
    lea          secq, [tableq+12] ; sec_taps
    PREP_REGS %1, %2
    vpbroadcastd   m2, [pw_2048]
    pxor           m0, m0
%if %1*%2 > mmsize
.sec_v_loop:
%endif
    LOAD_BLOCK %1, %2
.sec_k_loop:
    vpbroadcastb   m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
    dec            kq
    jge .sec_k_loop
    ADJUST_PIXEL %1, %2, m0, m2
%if %1*%2 > mmsize
    mov          dstq, dst4q
    lea         top1q, [rsp+0x90]
    lea         top2q, [rsp+0xA0]
    lea         dst4q, [dst4q+strideq*4]
    add            hq, 4
    jl .sec_v_loop
%endif
    RET

.d0k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq   m6, [dstq+strideq*1-1]
    vpbroadcastq  m10, [dstq+strideq*2-1]
    movd          xm5, [topq+strideq*1+1]
    movd          xm9, [dstq+strideq*0+1]
    psrldq        m11, m6, 2
    psrldq        m12, m10, 2
    vinserti128    m6, [dstq+stride3q -1], 1
    vinserti128   m10, [dstq+strideq*4-1], 1
    vpblendd       m5, m11, 0x10
    vpblendd       m9, m12, 0x10
    movu          m11, [blend_4x4+16]
    punpckldq      m6, m10
    punpckldq      m5, m9
    vpblendvb      m6, [rsp+gprsize+0x28], m11
 %else
    movd          xm5, [topq +strideq*1+1]
    movq          xm6, [dstq +strideq*1-1]
    movq         xm10, [dstq +stride3q -1]
    movq         xm11, [dst4q+strideq*1-1]
    pinsrd        xm5, [dstq +strideq*0+1], 1
    movhps        xm6, [dstq +strideq*2-1]
    movhps       xm10, [dst4q+strideq*0-1]
    movhps       xm11, [dst4q+strideq*2-1]
    psrldq        xm9, xm6, 2
    shufps        xm5, xm9, q2010  ; -1 +0 +1 +2
    shufps        xm6, xm10, q2020 ; +1 +2 +3 +4
    psrldq        xm9, xm11, 2
    psrldq       xm10, 2
    shufps       xm10, xm9, q2020  ; +3 +4 +5 +6
    movd          xm9, [dst4q+stride3q -1]
    pinsrd        xm9, [dst4q+strideq*4-1], 1
    shufps       xm11, xm9, q1020  ; +5 +6 +7 +8
    pmovzxbw       m9, [leftq+3]
    vinserti128    m6, xm11, 1
    movu          m11, [blend_4x8_0+4]
    vinserti128    m5, xm10, 1
    vpblendvb      m6, m9, m11
 %endif
%else
    lea           r13, [blend_8x8_0+16]
    movq          xm5, [top2q         +1]
    vbroadcasti128 m10, [dstq+strideq*1-1]
    vbroadcasti128 m11, [dstq+strideq*2-1]
    movhps        xm5, [dstq+strideq*0+1]
    vinserti128    m6, m10, [dstq+stride3q -1], 1
    vinserti128    m9, m11, [dstq+strideq*4-1], 1
    psrldq        m10, 2
    psrldq        m11, 2
    punpcklqdq     m6, m9
    movu           m9, [r13+hq*2*1+16*1]
    punpcklqdq    m10, m11
    vpblendd       m5, m10, 0xF0
    vpblendvb      m6, [rsp+gprsize+80+hq*8+64+8*1], m9
%endif
    ret
.d1k0:
.d2k0:
.d3k0:
%if %1 == 4
 %if %2 == 4
    movq          xm6, [dstq+strideq*0-1]
    movq          xm9, [dstq+strideq*1-1]
    vinserti128    m6, [dstq+strideq*2-1], 1
    vinserti128    m9, [dstq+stride3q -1], 1
    movu          m11, [rsp+gprsize+0x10]
    pcmpeqd       m12, m12
    psrldq         m5, m6, 2
    psrldq        m10, m9, 2
    psrld         m12, 24
    punpckldq      m6, m9
    punpckldq      m5, m10
    vpblendvb      m6, m11, m12
 %else
    movq          xm6, [dstq +strideq*0-1]
    movq          xm9, [dstq +strideq*2-1]
    movhps        xm6, [dstq +strideq*1-1]
    movhps        xm9, [dstq +stride3q -1]
    movq         xm10, [dst4q+strideq*0-1]
    movhps       xm10, [dst4q+strideq*1-1]
    psrldq        xm5, xm6, 2
    psrldq       xm11, xm9, 2
    shufps        xm5, xm11, q2020
    movq         xm11, [dst4q+strideq*2-1]
    movhps       xm11, [dst4q+stride3q -1]
    shufps        xm6, xm9, q2020
    shufps        xm9, xm10, xm11, q2020
    vinserti128    m6, xm9, 1
    pmovzxbw       m9, [leftq+1]
    psrldq       xm10, 2
    psrldq       xm11, 2
    shufps       xm10, xm11, q2020
    vpbroadcastd  m11, [blend_4x8_0+4]
    vinserti128    m5, xm10, 1
    vpblendvb      m6, m9, m11
 %endif
%else
    movu          xm5, [dstq+strideq*0-1]
    movu          xm9, [dstq+strideq*1-1]
    vinserti128    m5, [dstq+strideq*2-1], 1
    vinserti128    m9, [dstq+stride3q -1], 1
    movu          m10, [blend_8x8_0+16]
    punpcklqdq     m6, m5, m9
    vpblendvb      m6, [rsp+gprsize+80+hq*8+64], m10
    psrldq         m5, 2
    psrldq         m9, 2
    punpcklqdq     m5, m9
%endif
    ret
.d4k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq  m10, [dstq+strideq*1-1]
    vpbroadcastq  m11, [dstq+strideq*2-1]
    movd          xm6, [topq+strideq*1-1]
    movd          xm9, [dstq+strideq*0-1]
    psrldq         m5, m10, 2
    psrldq        m12, m11, 2
    vpblendd       m6, m10, 0x10
    vpblendd       m9, m11, 0x10
    movu          m10, [blend_4x4]
    vinserti128    m5, [dstq+stride3q +1], 1
    vinserti128   m12, [dstq+strideq*4+1], 1
    punpckldq      m6, m9
    punpckldq      m5, m12
    vpblendvb      m6, [rsp+gprsize+0x40], m10
 %else
    movd          xm6, [topq +strideq*1-1]
    movq          xm9, [dstq +strideq*1-1]
    movq         xm10, [dstq +stride3q -1]
    movq         xm11, [dst4q+strideq*1-1]
    pinsrd        xm6, [dstq +strideq*0-1], 1
    movhps        xm9, [dstq +strideq*2-1]
    movhps       xm10, [dst4q+strideq*0-1]
    movhps       xm11, [dst4q+strideq*2-1]
    psrldq        xm5, xm9, 2
    shufps        xm6, xm9, q2010
    psrldq        xm9, xm10, 2
    shufps        xm5, xm9, q2020
    shufps       xm10, xm11, q2020
    movd          xm9, [dst4q+stride3q +1]
    vinserti128    m6, xm10, 1
    pinsrd        xm9, [dst4q+strideq*4+1], 1
    psrldq       xm11, 2
    pmovzxbw      m10, [leftq-1]
    shufps       xm11, xm9, q1020
    movu           m9, [blend_4x8_0]
    vinserti128    m5, xm11, 1
    vpblendvb      m6, m10, m9
 %endif
%else
    lea           r13, [blend_8x8_0+8]
    movq          xm6, [top2q         -1]
    vbroadcasti128 m5, [dstq+strideq*1-1]
    vbroadcasti128 m9, [dstq+strideq*2-1]
    movhps        xm6, [dstq+strideq*0-1]
    movu          m11, [r13+hq*2*1+16*1]
    punpcklqdq    m10, m5, m9
    vinserti128    m5, [dstq+stride3q -1], 1
    vinserti128    m9, [dstq+strideq*4-1], 1
    vpblendd       m6, m10, 0xF0
    vpblendvb      m6, [rsp+gprsize+80+hq*8+64-8*1], m11
    psrldq         m5, 2
    psrldq         m9, 2
    punpcklqdq     m5, m9
%endif
    ret
.d5k0:
.d6k0:
.d7k0:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [topq+strideq*1  ]
    vpbroadcastd   m5, [dstq+strideq*1  ]
    vpbroadcastd   m9, [dstq+strideq*2  ]
    vpblendd      xm6, [dstq+strideq*0-4], 0x2
    vpblendd       m5, m9, 0x22
    vpblendd       m6, m5, 0x30
    vinserti128    m5, [dstq+stride3q  ], 1
    vpblendd       m5, [dstq+strideq*4-20], 0x20
 %else
    movd          xm6, [topq +strideq*1]
    movd          xm5, [dstq +strideq*1]
    movd          xm9, [dstq +stride3q ]
    movd         xm10, [dst4q+strideq*1]
    movd         xm11, [dst4q+stride3q ]
    pinsrd        xm6, [dstq +strideq*0], 1
    pinsrd        xm5, [dstq +strideq*2], 1
    pinsrd        xm9, [dst4q+strideq*0], 1
    pinsrd       xm10, [dst4q+strideq*2], 1
    pinsrd       xm11, [dst4q+strideq*4], 1
    punpcklqdq    xm6, xm5
    punpcklqdq    xm5, xm9
    punpcklqdq    xm9, xm10
    punpcklqdq   xm10, xm11
    vinserti128    m6, xm9, 1
    vinserti128    m5, xm10, 1
 %endif
%else
    movq          xm6, [top2q        ]
    movq          xm5, [dstq+strideq*1]
    movq          xm9, [dstq+stride3q ]
    movhps        xm6, [dstq+strideq*0]
    movhps        xm5, [dstq+strideq*2]
    movhps        xm9, [dstq+strideq*4]
    vinserti128    m6, xm5, 1
    vinserti128    m5, xm9, 1
%endif
    ret
.d0k1:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [dstq +strideq*2-2]
    movd          xm9, [dstq +stride3q -2]
    movd          xm5, [topq +strideq*0+2]
    movd         xm10, [topq +strideq*1+2]
    pinsrw        xm6, [leftq+4], 0
    pinsrw        xm9, [leftq+6], 0
    vinserti128    m5, [dstq +strideq*0+2], 1
    vinserti128   m10, [dstq +strideq*1+2], 1
    vinserti128    m6, [dst4q+strideq*0-2], 1
    vinserti128    m9, [dst4q+strideq*1-2], 1
    punpckldq      m5, m10
    punpckldq      m6, m9
 %else
    movq          xm6, [dstq +strideq*2-2]
    movd         xm10, [dst4q+strideq*2-2]
    movd          xm5, [topq +strideq*0+2]
    movq          xm9, [dst4q+strideq*0-2]
    movhps        xm6, [dstq +stride3q -2]
    pinsrw       xm10, [dst4q+stride3q ], 3
    pinsrd        xm5, [topq +strideq*1+2], 1
    movhps        xm9, [dst4q+strideq*1-2]
    pinsrd       xm10, [dst8q+strideq*0-2], 2
    pinsrd        xm5, [dstq +strideq*0+2], 2
    pinsrd       xm10, [dst8q+strideq*1-2], 3
    pinsrd        xm5, [dstq +strideq*1+2], 3
    shufps       xm11, xm6, xm9, q3131
    shufps        xm6, xm9, q2020
    movu           m9, [blend_4x8_3+8]
    vinserti128    m6, xm10, 1
    vinserti128    m5, xm11, 1
    vpblendvb      m6, [rsp+gprsize+16+8], m9
 %endif
%else
    lea           r13, [blend_8x8_1+16]
    movq          xm6, [dstq +strideq*2-2]
    movq          xm9, [dstq +stride3q -2]
    movq          xm5, [top1q          +2]
    movq         xm10, [top2q          +2]
    movu          m11, [r13+hq*2*2+16*2]
    vinserti128    m6, [dst4q+strideq*0-2], 1
    vinserti128    m9, [dst4q+strideq*1-2], 1
    vinserti128    m5, [dstq +strideq*0+2], 1
    vinserti128   m10, [dstq +strideq*1+2], 1
    punpcklqdq     m6, m9
    punpcklqdq     m5, m10
    vpblendvb      m6, [rsp+gprsize+16+hq*8+64+8*2], m11
%endif
    ret
.d1k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq   m6, [dstq+strideq*1-2]
    vpbroadcastq   m9, [dstq+strideq*2-2]
    movd          xm5, [topq+strideq*1+2]
    movd         xm10, [dstq+strideq*0+2]
    psrldq        m11, m6, 4
    psrldq        m12, m9, 4
    vpblendd       m5, m11, 0x10
    movq         xm11, [leftq+2]
    vinserti128    m6, [dstq+stride3q -2], 1
    punpckldq    xm11, xm11
    vpblendd      m10, m12, 0x10
    pcmpeqd       m12, m12
    pmovzxwd      m11, xm11
    psrld         m12, 16
    punpckldq      m6, m9
    vpbroadcastd   m9, [dstq+strideq*4-2]
    vpblendvb      m6, m11, m12
    punpckldq      m5, m10
    vpblendd       m6, m9, 0x20
 %else
    movd          xm5, [topq +strideq*1+2]
    movq          xm6, [dstq +strideq*1-2]
    movq          xm9, [dstq +stride3q -2]
    movq         xm10, [dst4q+strideq*1-2]
    movd         xm11, [dst4q+stride3q -2]
    pinsrd        xm5, [dstq +strideq*0+2], 1
    movhps        xm6, [dstq +strideq*2-2]
    movhps        xm9, [dst4q+strideq*0-2]
    movhps       xm10, [dst4q+strideq*2-2]
    pinsrd       xm11, [dst4q+strideq*4-2], 1
    shufps        xm5, xm6, q3110
    shufps        xm6, xm9, q2020
    shufps        xm9, xm10, q3131
    shufps       xm10, xm11, q1020
    movu          m11, [blend_4x8_2+4]
    vinserti128    m6, xm10, 1
    vinserti128    m5, xm9, 1
    vpblendvb      m6, [rsp+gprsize+16+4], m11
 %endif
%else
    lea           r13, [blend_8x8_1+16]
    movq          xm5, [top2q         +2]
    vbroadcasti128 m6, [dstq+strideq*1-2]
    vbroadcasti128 m9, [dstq+strideq*2-2]
    movhps        xm5, [dstq+strideq*0+2]
    shufps        m10, m6, m9, q2121
    vinserti128    m6, [dstq+stride3q -2], 1
    vinserti128    m9, [dstq+strideq*4-2], 1
    movu          m11, [r13+hq*2*1+16*1]
    vpblendd       m5, m10, 0xF0
    punpcklqdq     m6, m9
    vpblendvb      m6, [rsp+gprsize+16+hq*8+64+8*1], m11
%endif
    ret
.d2k1:
%if %1 == 4
 %if %2 == 4
    movq         xm11, [leftq]
    movq          xm6, [dstq+strideq*0-2]
    movq          xm9, [dstq+strideq*1-2]
    vinserti128    m6, [dstq+strideq*2-2], 1
    vinserti128    m9, [dstq+stride3q -2], 1
    punpckldq    xm11, xm11
    psrldq         m5, m6, 4
    psrldq        m10, m9, 4
    pmovzxwd      m11, xm11
    punpckldq      m6, m9
    punpckldq      m5, m10
    pblendw        m6, m11, 0x05
 %else
    movq          xm5, [dstq +strideq*0-2]
    movq          xm9, [dstq +strideq*2-2]
    movq         xm10, [dst4q+strideq*0-2]
    movq         xm11, [dst4q+strideq*2-2]
    movhps        xm5, [dstq +strideq*1-2]
    movhps        xm9, [dstq +stride3q -2]
    movhps       xm10, [dst4q+strideq*1-2]
    movhps       xm11, [dst4q+stride3q -2]
    shufps        xm6, xm5, xm9, q2020
    shufps        xm5, xm9, q3131
    shufps        xm9, xm10, xm11, q2020
    shufps       xm10, xm11, q3131
    pmovzxwd      m11, [leftq]
    vinserti128    m6, xm9, 1
    vinserti128    m5, xm10, 1
    pblendw        m6, m11, 0x55
 %endif
%else
    mova          m11, [rsp+gprsize+16+hq*8+64]
    movu          xm5, [dstq+strideq*0-2]
    movu          xm9, [dstq+strideq*1-2]
    vinserti128    m5, [dstq+strideq*2-2], 1
    vinserti128    m9, [dstq+stride3q -2], 1
    shufps         m6, m5, m9, q1010
    shufps         m5, m9, q2121
    pblendw        m6, m11, 0x11
%endif
    ret
.d3k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq  m11, [dstq+strideq*1-2]
    vpbroadcastq  m12, [dstq+strideq*2-2]
    movd          xm6, [topq+strideq*1-2]
    movd          xm9, [dstq+strideq*0-2]
    pblendw       m11, [leftq-16+2], 0x01
    pblendw       m12, [leftq-16+4], 0x01
    pinsrw        xm9, [leftq- 0+0], 0
    psrldq         m5, m11, 4
    psrldq        m10, m12, 4
    vinserti128    m5, [dstq+stride3q +2], 1
    vinserti128   m10, [dstq+strideq*4+2], 1
    vpblendd       m6, m11, 0x10
    vpblendd       m9, m12, 0x10
    punpckldq      m6, m9
    punpckldq      m5, m10
 %else
    movd          xm6, [topq +strideq*1-2]
    movq          xm5, [dstq +strideq*1-2]
    movq          xm9, [dstq +stride3q -2]
    movq         xm10, [dst4q+strideq*1-2]
    movd         xm11, [dst4q+stride3q +2]
    pinsrw        xm6, [dstq +strideq*0  ], 3
    movhps        xm5, [dstq +strideq*2-2]
    movhps        xm9, [dst4q+strideq*0-2]
    movhps       xm10, [dst4q+strideq*2-2]
    pinsrd       xm11, [dst4q+strideq*4+2], 1
    shufps        xm6, xm5, q2010
    shufps        xm5, xm9, q3131
    shufps        xm9, xm10, q2020
    shufps       xm10, xm11, q1031
    movu          m11, [blend_4x8_2]
    vinserti128    m6, xm9, 1
    vinserti128    m5, xm10, 1
    vpblendvb      m6, [rsp+gprsize+16-4], m11
 %endif
%else
    lea           r13, [blend_8x8_1+8]
    movq          xm6, [top2q         -2]
    vbroadcasti128 m5, [dstq+strideq*1-2]
    vbroadcasti128 m10, [dstq+strideq*2-2]
    movhps        xm6, [dstq+strideq*0-2]
    punpcklqdq     m9, m5, m10
    vinserti128    m5, [dstq+stride3q -2], 1
    vinserti128   m10, [dstq+strideq*4-2], 1
    movu          m11, [r13+hq*2*1+16*1]
    vpblendd       m6, m9, 0xF0
    shufps         m5, m10, q2121
    vpblendvb      m6, [rsp+gprsize+16+hq*8+64-8*1], m11
%endif
    ret
.d4k1:
%if %1 == 4
 %if %2 == 4
    vinserti128    m6, [dstq +strideq*0-2], 1
    vinserti128    m9, [dstq +strideq*1-2], 1
    movd          xm5, [dstq +strideq*2+2]
    movd         xm10, [dstq +stride3q +2]
    pblendw        m6, [leftq-16+0], 0x01
    pblendw        m9, [leftq-16+2], 0x01
    vinserti128    m5, [dst4q+strideq*0+2], 1
    vinserti128   m10, [dst4q+strideq*1+2], 1
    vpblendd       m6, [topq +strideq*0-2], 0x01
    vpblendd       m9, [topq +strideq*1-2], 0x01
    punpckldq      m5, m10
    punpckldq      m6, m9
 %else
    movd          xm6, [topq +strideq*0-2]
    movq          xm5, [dstq +strideq*2-2]
    movq          xm9, [dst4q+strideq*0-2]
    movd         xm10, [dst4q+strideq*2+2]
    pinsrd        xm6, [topq +strideq*1-2], 1
    movhps        xm5, [dstq +stride3q -2]
    movhps        xm9, [dst4q+strideq*1-2]
    pinsrd       xm10, [dst4q+stride3q +2], 1
    pinsrd        xm6, [dstq +strideq*0-2], 2
    pinsrd       xm10, [dst8q+strideq*0+2], 2
    pinsrd        xm6, [dstq +strideq*1-2], 3
    pinsrd       xm10, [dst8q+strideq*1+2], 3
    shufps       xm11, xm5, xm9, q2020
    shufps        xm5, xm9, q3131
    movu           m9, [blend_4x8_3]
    vinserti128    m6, xm11, 1
    vinserti128    m5, xm10, 1
    vpblendvb      m6, [rsp+gprsize+16-8], m9
 %endif
%else
    lea           r13, [blend_8x8_1]
    movu          m11, [r13+hq*2*2+16*2]
    movq          xm6, [top1q          -2]
    movq          xm9, [top2q          -2]
    movq          xm5, [dstq +strideq*2+2]
    movq         xm10, [dstq +stride3q +2]
    vinserti128    m6, [dstq +strideq*0-2], 1
    vinserti128    m9, [dstq +strideq*1-2], 1
    vinserti128    m5, [dst4q+strideq*0+2], 1
    vinserti128   m10, [dst4q+strideq*1+2], 1
    punpcklqdq     m6, m9
    vpblendvb      m6, [rsp+gprsize+16+hq*8+64-8*2], m11
    punpcklqdq     m5, m10
%endif
    ret
.d5k1:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [topq +strideq*0-1]
    movd          xm9, [topq +strideq*1-1]
    movd          xm5, [dstq +strideq*2+1]
    movd         xm10, [dstq +stride3q +1]
    pcmpeqd       m12, m12
    pmovzxbw      m11, [leftq-8+1]
    psrld         m12, 24
    vinserti128    m6, [dstq +strideq*0-1], 1
    vinserti128    m9, [dstq +strideq*1-1], 1
    vinserti128    m5, [dst4q+strideq*0+1], 1
    vinserti128   m10, [dst4q+strideq*1+1], 1
    punpckldq      m6, m9
    pxor           m9, m9
    vpblendd      m12, m9, 0x0F
    punpckldq      m5, m10
    vpblendvb      m6, m11, m12
 %else
    movd          xm6, [topq +strideq*0-1]
    movq          xm5, [dstq +strideq*2-1]
    movq          xm9, [dst4q+strideq*0-1]
    movd         xm10, [dst4q+strideq*2+1]
    pinsrd        xm6, [topq +strideq*1-1], 1
    movhps        xm5, [dstq +stride3q -1]
    movhps        xm9, [dst4q+strideq*1-1]
    pinsrd       xm10, [dst4q+stride3q +1], 1
    pinsrd        xm6, [dstq +strideq*0-1], 2
    pinsrd       xm10, [dst8q+strideq*0+1], 2
    pinsrd        xm6, [dstq +strideq*1-1], 3
    pinsrd       xm10, [dst8q+strideq*1+1], 3
    shufps       xm11, xm5, xm9, q2020
    vinserti128    m6, xm11, 1
    pmovzxbw      m11, [leftq-3]
    psrldq        xm5, 2
    psrldq        xm9, 2
    shufps        xm5, xm9, q2020
    movu           m9, [blend_4x8_1]
    vinserti128    m5, xm10, 1
    vpblendvb      m6, m11, m9
 %endif
%else
    lea           r13, [blend_8x8_0]
    movu          m11, [r13+hq*2*2+16*2]
    movq          xm6, [top1q          -1]
    movq          xm9, [top2q          -1]
    movq          xm5, [dstq +strideq*2+1]
    movq         xm10, [dstq +stride3q +1]
    vinserti128    m6, [dstq +strideq*0-1], 1
    vinserti128    m9, [dstq +strideq*1-1], 1
    vinserti128    m5, [dst4q+strideq*0+1], 1
    vinserti128   m10, [dst4q+strideq*1+1], 1
    punpcklqdq     m6, m9
    punpcklqdq     m5, m10
    vpblendvb      m6, [rsp+gprsize+80+hq*8+64-8*2], m11
%endif
    ret
.d6k1:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [topq +strideq*0]
    movd          xm9, [topq +strideq*1]
    movd          xm5, [dstq +strideq*2]
    movd         xm10, [dstq +stride3q ]
    vinserti128    m6, [dstq +strideq*0], 1
    vinserti128    m9, [dstq +strideq*1], 1
    vinserti128    m5, [dst4q+strideq*0], 1
    vinserti128   m10, [dst4q+strideq*1], 1
    punpckldq      m6, m9
    punpckldq      m5, m10
 %else
    movd          xm5, [dstq +strideq*2]
    movd          xm6, [topq +strideq*0]
    movd          xm9, [dst4q+strideq*2]
    pinsrd        xm5, [dstq +stride3q ], 1
    pinsrd        xm6, [topq +strideq*1], 1
    pinsrd        xm9, [dst4q+stride3q ], 1
    pinsrd        xm5, [dst4q+strideq*0], 2
    pinsrd        xm6, [dstq +strideq*0], 2
    pinsrd        xm9, [dst8q+strideq*0], 2
    pinsrd        xm5, [dst4q+strideq*1], 3
    pinsrd        xm6, [dstq +strideq*1], 3
    pinsrd        xm9, [dst8q+strideq*1], 3
    vinserti128    m6, xm5, 1
    vinserti128    m5, xm9, 1
 %endif
%else
    movq          xm5, [dstq +strideq*2]
    movq          xm9, [dst4q+strideq*0]
    movq          xm6, [top1q          ]
    movq         xm10, [dstq +strideq*0]
    movhps        xm5, [dstq +stride3q ]
    movhps        xm9, [dst4q+strideq*1]
    movhps        xm6, [top2q          ]
    movhps       xm10, [dstq +strideq*1]
    vinserti128    m5, xm9, 1
    vinserti128    m6, xm10, 1
%endif
    ret
.d7k1:
%if %1 == 4
 %if %2 == 4
    movd          xm5, [dstq +strideq*2-1]
    movd          xm9, [dstq +stride3q -1]
    movd          xm6, [topq +strideq*0+1]
    movd         xm10, [topq +strideq*1+1]
    pinsrb        xm5, [leftq+ 5], 0
    pinsrb        xm9, [leftq+ 7], 0
    vinserti128    m6, [dstq +strideq*0+1], 1
    vinserti128   m10, [dstq +strideq*1+1], 1
    vinserti128    m5, [dst4q+strideq*0-1], 1
    vinserti128    m9, [dst4q+strideq*1-1], 1
    punpckldq      m6, m10
    punpckldq      m5, m9
 %else
    movd          xm6, [topq +strideq*0+1]
    movq          xm9, [dstq +strideq*2-1]
    movq         xm10, [dst4q+strideq*0-1]
    movd         xm11, [dst4q+strideq*2-1]
    pinsrd        xm6, [topq +strideq*1+1], 1
    movhps        xm9, [dstq +stride3q -1]
    movhps       xm10, [dst4q+strideq*1-1]
    pinsrd       xm11, [dst4q+stride3q -1], 1
    pinsrd        xm6, [dstq +strideq*0+1], 2
    pinsrd       xm11, [dst8q+strideq*0-1], 2
    pinsrd        xm6, [dstq +strideq*1+1], 3
    pinsrd       xm11, [dst8q+strideq*1-1], 3
    shufps        xm5, xm9, xm10, q2020
    vinserti128    m5, xm11, 1
    pmovzxbw      m11, [leftq+5]
    psrldq        xm9, 2
    psrldq       xm10, 2
    shufps        xm9, xm10, q2020
    movu          m10, [blend_4x8_1+8]
    vinserti128    m6, xm9, 1
    vpblendvb      m5, m11, m10
 %endif
%else
    lea           r13, [blend_8x8_0+16]
    movq          xm5, [dstq +strideq*2-1]
    movq          xm9, [dst4q+strideq*0-1]
    movq          xm6, [top1q          +1]
    movq         xm10, [dstq +strideq*0+1]
    movhps        xm5, [dstq +stride3q -1]
    movhps        xm9, [dst4q+strideq*1-1]
    movhps        xm6, [top2q          +1]
    movhps       xm10, [dstq +strideq*1+1]
    movu          m11, [r13+hq*2*2+16*2]
    vinserti128    m5, xm9, 1
    vinserti128    m6, xm10, 1
    vpblendvb      m5, [rsp+gprsize+80+hq*8+64+8*2], m11
%endif
    ret

.border_block:
    DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
%define rstk rsp
%assign stack_offset stack_offset_entry
%if %1 == 4 && %2 == 8
    PUSH           r9
 %assign regs_used 10
%else
 %assign regs_used 9
%endif
%if STACK_ALIGNMENT < 32
    PUSH  r%+regs_used
 %assign regs_used regs_used+1
%endif
    ALLOC_STACK 2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32

    pcmpeqw       m14, m14
    psllw         m14, 15 ; 0x8000

    ; prepare pixel buffers - body/right
%if %1 == 4
    INIT_XMM avx2
%endif
%if %2 == 8
    lea         dst4q, [dstq+strideq*4]
%endif
    lea      stride3q, [strideq*3]
    test        edgeb, 2 ; have_right
    jz .no_right
    pmovzxbw       m1, [dstq+strideq*0]
    pmovzxbw       m2, [dstq+strideq*1]
    pmovzxbw       m3, [dstq+strideq*2]
    pmovzxbw       m4, [dstq+stride3q]
    mova    [px+0*32], m1
    mova    [px+1*32], m2
    mova    [px+2*32], m3
    mova    [px+3*32], m4
%if %2 == 8
    pmovzxbw       m1, [dst4q+strideq*0]
    pmovzxbw       m2, [dst4q+strideq*1]
    pmovzxbw       m3, [dst4q+strideq*2]
    pmovzxbw       m4, [dst4q+stride3q]
    mova    [px+4*32], m1
    mova    [px+5*32], m2
    mova    [px+6*32], m3
    mova    [px+7*32], m4
%endif
    jmp .body_done
.no_right:
%if %1 == 4
    movd          xm1, [dstq+strideq*0]
    movd          xm2, [dstq+strideq*1]
    movd          xm3, [dstq+strideq*2]
    movd          xm4, [dstq+stride3q]
    pmovzxbw      xm1, xm1
    pmovzxbw      xm2, xm2
    pmovzxbw      xm3, xm3
    pmovzxbw      xm4, xm4
    movq    [px+0*32], xm1
    movq    [px+1*32], xm2
    movq    [px+2*32], xm3
    movq    [px+3*32], xm4
%else
    pmovzxbw      xm1, [dstq+strideq*0]
    pmovzxbw      xm2, [dstq+strideq*1]
    pmovzxbw      xm3, [dstq+strideq*2]
    pmovzxbw      xm4, [dstq+stride3q]
    mova    [px+0*32], xm1
    mova    [px+1*32], xm2
    mova    [px+2*32], xm3
    mova    [px+3*32], xm4
%endif
    movd [px+0*32+%1*2], xm14
    movd [px+1*32+%1*2], xm14
    movd [px+2*32+%1*2], xm14
    movd [px+3*32+%1*2], xm14
%if %2 == 8
 %if %1 == 4
    movd          xm1, [dst4q+strideq*0]
    movd          xm2, [dst4q+strideq*1]
    movd          xm3, [dst4q+strideq*2]
    movd          xm4, [dst4q+stride3q]
    pmovzxbw      xm1, xm1
    pmovzxbw      xm2, xm2
    pmovzxbw      xm3, xm3
    pmovzxbw      xm4, xm4
    movq    [px+4*32], xm1
    movq    [px+5*32], xm2
    movq    [px+6*32], xm3
    movq    [px+7*32], xm4
 %else
    pmovzxbw      xm1, [dst4q+strideq*0]
    pmovzxbw      xm2, [dst4q+strideq*1]
    pmovzxbw      xm3, [dst4q+strideq*2]
    pmovzxbw      xm4, [dst4q+stride3q]
    mova    [px+4*32], xm1
    mova    [px+5*32], xm2
    mova    [px+6*32], xm3
    mova    [px+7*32], xm4
 %endif
    movd [px+4*32+%1*2], xm14
    movd [px+5*32+%1*2], xm14
    movd [px+6*32+%1*2], xm14
    movd [px+7*32+%1*2], xm14
%endif
.body_done:

    ; top
    test        edgeb, 4 ; have_top
    jz .no_top
    test        edgeb, 1 ; have_left
    jz .top_no_left
    test        edgeb, 2 ; have_right
    jz .top_no_right
    pmovzxbw       m1, [topq+strideq*0-(%1/2)]
    pmovzxbw       m2, [topq+strideq*1-(%1/2)]
    movu [px-2*32-%1], m1
    movu [px-1*32-%1], m2
    jmp .top_done
.top_no_right:
    pmovzxbw       m1, [topq+strideq*0-%1]
    pmovzxbw       m2, [topq+strideq*1-%1]
    movu [px-2*32-%1*2], m1
    movu [px-1*32-%1*2], m2
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.top_no_left:
    test        edgeb, 2 ; have_right
    jz .top_no_left_right
    pmovzxbw       m1, [topq+strideq*0]
    pmovzxbw       m2, [topq+strideq*1]
    mova  [px-2*32+0], m1
    mova  [px-1*32+0], m2
    movd  [px-2*32-4], xm14
    movd  [px-1*32-4], xm14
    jmp .top_done
.top_no_left_right:
%if %1 == 4
    movd          xm1, [topq+strideq*0]
    pinsrd        xm1, [topq+strideq*1], 1
    pmovzxbw      xm1, xm1
    movq  [px-2*32+0], xm1
    movhps [px-1*32+0], xm1
%else
    pmovzxbw      xm1, [topq+strideq*0]
    pmovzxbw      xm2, [topq+strideq*1]
    mova  [px-2*32+0], xm1
    mova  [px-1*32+0], xm2
%endif
    movd  [px-2*32-4], xm14
    movd  [px-1*32-4], xm14
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.no_top:
    movu [px-2*32-%1], m14
    movu [px-1*32-%1], m14
.top_done:

    ; left
    test        edgeb, 1 ; have_left
    jz .no_left
    pmovzxbw      xm1, [leftq+ 0]
%if %2 == 8
    pmovzxbw      xm2, [leftq+ 8]
%endif
    movd   [px+0*32-4], xm1
    pextrd [px+1*32-4], xm1, 1
    pextrd [px+2*32-4], xm1, 2
    pextrd [px+3*32-4], xm1, 3
%if %2 == 8
    movd   [px+4*32-4], xm2
    pextrd [px+5*32-4], xm2, 1
    pextrd [px+6*32-4], xm2, 2
    pextrd [px+7*32-4], xm2, 3
%endif
    jmp .left_done
.no_left:
    movd  [px+0*32-4], xm14
    movd  [px+1*32-4], xm14
    movd  [px+2*32-4], xm14
    movd  [px+3*32-4], xm14
%if %2 == 8
    movd  [px+4*32-4], xm14
    movd  [px+5*32-4], xm14
    movd  [px+6*32-4], xm14
    movd  [px+7*32-4], xm14
%endif
.left_done:

    ; bottom
    DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
    test        edgeb, 8 ; have_bottom
    jz .no_bottom
    lea         dst8q, [dstq+%2*strideq]
    test        edgeb, 1 ; have_left
    jz .bottom_no_left
    test        edgeb, 2 ; have_right
    jz .bottom_no_right
    pmovzxbw       m1, [dst8q-(%1/2)]
    pmovzxbw       m2, [dst8q+strideq-(%1/2)]
    movu [px+(%2+0)*32-%1], m1
    movu [px+(%2+1)*32-%1], m2
    jmp .bottom_done
.bottom_no_right:
    pmovzxbw       m1, [dst8q-%1]
    pmovzxbw       m2, [dst8q+strideq-%1]
    movu [px+(%2+0)*32-%1*2], m1
    movu [px+(%2+1)*32-%1*2], m2
%if %1 == 8
    movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
%endif
    movd [px+(%2+0)*32+%1*2], xm14
    movd [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.bottom_no_left:
    test        edgeb, 2 ; have_right
    jz .bottom_no_left_right
    pmovzxbw       m1, [dst8q]
    pmovzxbw       m2, [dst8q+strideq]
    mova [px+(%2+0)*32+0], m1
    mova [px+(%2+1)*32+0], m2
    movd [px+(%2+0)*32-4], xm14
    movd [px+(%2+1)*32-4], xm14
    jmp .bottom_done
.bottom_no_left_right:
%if %1 == 4
    movd          xm1, [dst8q]
    pinsrd        xm1, [dst8q+strideq], 1
    pmovzxbw      xm1, xm1
    movq [px+(%2+0)*32+0], xm1
    movhps [px+(%2+1)*32+0], xm1
%else
    pmovzxbw      xm1, [dst8q]
    pmovzxbw      xm2, [dst8q+strideq]
    mova [px+(%2+0)*32+0], xm1
    mova [px+(%2+1)*32+0], xm2
%endif
    movd [px+(%2+0)*32-4], xm14
    movd [px+(%2+1)*32-4], xm14
    movd [px+(%2+0)*32+%1*2], xm14
    movd [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.no_bottom:
    movu [px+(%2+0)*32-%1], m14
    movu [px+(%2+1)*32-%1], m14
.bottom_done:

    ; actual filter
    INIT_YMM avx2
    DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
    ; register to shuffle values into after packing
    vbroadcasti128 m12, [shufb_lohi]

    mov      dampingd, r7m
    xor         zerod, zerod
    movifnidn    prid, prim
    sub      dampingd, 31
    movifnidn secdmpd, secdmpm
    test         prid, prid
    jz .border_sec_only
    movd          xm0, prid
    lzcnt     pridmpd, prid
    add       pridmpd, dampingd
    cmovs     pridmpd, zerod
    mov       [rsp+0], pridmpq ; pri_shift
    test      secdmpd, secdmpd
    jz .border_pri_only
    movd          xm1, secdmpd
    lzcnt     secdmpd, secdmpd
    add       secdmpd, dampingd
    cmovs     secdmpd, zerod
    mov       [rsp+8], secdmpq ; sec_shift
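    ; note: damping was biased by -31 above, and lzcnt x == 31-ilog2(x) for
    ; nonzero 32-bit x, so the shifts come out as
    ; max(0, damping - ilog2(strength))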

    DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
    lea        tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
    vpbroadcastb   m0, xm0 ; pri_strength
    vpbroadcastb   m1, xm1 ; sec_strength
    and          prid, 1
    lea          priq, [tableq+priq*2+8] ; pri_taps
    lea          secq, [tableq+12]       ; sec_taps

    BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
    vpbroadcastb   m2, [priq+kq] ; pri_taps
    vpbroadcastb   m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
    dec            kq
    jge .border_k_loop

    vpbroadcastd  m10, [pw_2048]
    BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea          dstq, [dstq+strideq*vloop_lines]
    add          stkq, 32*vloop_lines
    dec            hd
    jg .border_v_loop
%endif
    RET

.border_pri_only:
    DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
    lea        tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
    vpbroadcastb   m0, xm0 ; pri_strength
    and          prid, 1
    lea          priq, [tableq+priq*2+8] ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd   m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb   m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec            kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea          dstq, [dstq+strideq*vloop_lines]
    add          stkq, 32*vloop_lines
    dec            hd
    jg .border_pri_v_loop
%endif
    RET

.border_sec_only:
    DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
    movd          xm1, secdmpd
    lzcnt     secdmpd, secdmpd
    add       secdmpd, dampingd
    cmovs     secdmpd, zerod
    mov       [rsp+8], secdmpq ; sec_shift
    DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
    lea        tableq, [tap_table]
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask
    DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
    vpbroadcastb   m1, xm1 ; sec_strength
    lea          secq, [tableq+12] ; sec_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd   m0, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
    vpbroadcastb   m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec            kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea          dstq, [dstq+strideq*vloop_lines]
    add          stkq, 32*vloop_lines
    dec            hd
    jg .border_sec_v_loop
%endif
    RET
%endmacro

CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4

INIT_YMM avx2
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea      stride3q, [strideq*3]
    movq          xm0, [srcq+strideq*0]
    movq          xm1, [srcq+strideq*1]
    movq          xm2, [srcq+strideq*2]
    movq          xm3, [srcq+stride3q ]
    lea          srcq, [srcq+strideq*4]
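    ; rows 4-7 are loaded in reverse order into the upper 128-bit lanes,
    ; pairing row 0 with row 7, row 1 with row 6, etc.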
    vpbroadcastq   m4, [srcq+stride3q ]
    vpbroadcastq   m5, [srcq+strideq*2]
    vpblendd       m0, m4, 0xf0
    vpblendd       m1, m5, 0xf0
    vpbroadcastq   m4, [srcq+strideq*1]
    vpbroadcastq   m5, [srcq+strideq*0]
    vpblendd       m2, m4, 0xf0
    vpblendd       m3, m5, 0xf0
    pxor           m4, m4
    punpcklbw      m0, m4
    punpcklbw      m1, m4
    punpcklbw      m2, m4
    punpcklbw      m3, m4
cglobal_label .main
    vpbroadcastd   m4, [pw_128]
    PROLOGUE 3, 4, 15
    psubw          m0, m4
    psubw          m1, m4
    psubw          m2, m4
    psubw          m3, m4

    ; shuffle registers to generate partial_sum_diag[0-1] together
    vperm2i128     m7, m0, m0, 0x01
    vperm2i128     m6, m1, m1, 0x01
    vperm2i128     m5, m2, m2, 0x01
    vperm2i128     m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw          m8, m0, m1
    paddw          m9, m2, m3
    phaddw        m10, m0, m1
    phaddw        m11, m2, m3
    paddw          m8, m9
    phaddw        m10, m11
    vextracti128  xm9, m8, 1
    vextracti128 xm11, m10, 1
    paddw         xm8, xm9  ; partial_sum_hv[1]
    phaddw       xm10, xm11 ; partial_sum_hv[0]
    vinserti128    m8, xm10, 1
    vpbroadcastd   m9, [div_table+44]
    pmaddwd        m8, m8
    pmulld         m8, m9   ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq         m9, m1, 2
    psrldq        m10, m1, 14
    pslldq        m11, m2, 4
    psrldq        m12, m2, 12
    pslldq        m13, m3, 6
    psrldq        m14, m3, 10
    paddw          m9, m11
    paddw         m10, m12
    paddw          m9, m13
    paddw         m10, m14
    pslldq        m11, m4, 8
    psrldq        m12, m4, 8
    pslldq        m13, m5, 10
    psrldq        m14, m5, 6
    paddw          m9, m11
    paddw         m10, m12
    paddw          m9, m13
    paddw         m10, m14 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]
    vbroadcasti128 m12, [div_table+0]
    paddw          m9, m0  ; partial_sum_diag[0/1][0-7]
    pshufb        m10, m14
    punpckhwd     m11, m9, m10
    punpcklwd      m9, m10
    pmaddwd       m11, m11
    pmaddwd        m9, m9
    pmulld        m11, m13
    pmulld         m9, m12
    paddd          m9, m11 ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw         m10, m0, m1
    paddw         m11, m2, m3
    paddw         m12, m4, m5
    paddw         m13, m6, m7
    phaddw         m0, m4
    phaddw         m1, m5
    phaddw         m2, m6
    phaddw         m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq         m4, m11, 2
    psrldq        m11, 14
    pslldq         m5, m12, 4
    psrldq        m12, 12
    pslldq         m6, m13, 6
    psrldq        m13, 10
    paddw          m4, m10
    paddw         m11, m12
    vpbroadcastd  m12, [div_table+44]
    paddw          m5, m6
    paddw         m11, m13 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]
    paddw          m4, m5  ; partial_sum_alt[3/2] left
    pshuflw        m5, m11, q3012
    punpckhwd      m6, m11, m4
    punpcklwd      m4, m5
    pmaddwd        m6, m6
    pmaddwd        m4, m4
    pmulld         m6, m12
    pmulld         m4, m13
    paddd          m4, m6  ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq         m5, m1, 2
    psrldq         m1, 14
    pslldq         m6, m2, 4
    psrldq         m2, 12
    pslldq         m7, m3, 6
    psrldq         m3, 10
    paddw          m5, m0
    paddw          m1, m2
    paddw          m6, m7
    paddw          m1, m3  ; partial_sum_alt[0/1] right
    paddw          m5, m6  ; partial_sum_alt[0/1] left
    pshuflw        m0, m1, q3012
    punpckhwd      m1, m5
    punpcklwd      m5, m0
    pmaddwd        m1, m1
    pmaddwd        m5, m5
    pmulld         m1, m12
    pmulld         m5, m13
    paddd          m5, m1  ; cost1[a-d] | cost3[a-d]

    mova          xm0, [pd_47130256+ 16]
    mova           m1, [pd_47130256]
    phaddd         m9, m8
    phaddd         m5, m4
    phaddd         m9, m5
    vpermd         m0, m9 ; cost[0-3]
    vpermd         m1, m9 ; cost[4-7] | cost[0-3]

    ; now find the best cost
    pmaxsd        xm2, xm0, xm1
    pshufd        xm3, xm2, q1032
    pmaxsd        xm2, xm3
    pshufd        xm3, xm2, q2301
    pmaxsd        xm2, xm3 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd         xm4, xm1, xm2
    psubd         xm3, xm0, xm2
    packssdw      xm3, xm4
    phminposuw    xm3, xm3

    ; convert idx to 32-bits
    psrld         xm3, 16
    movd          eax, xm3

    ; get idx^4 complement
    vpermd         m3, m1
    psubd         xm2, xm3
    psrld         xm2, 10
    movd       [varq], xm2
    RET

%endif ; ARCH_X86_64