; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2019, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

%macro DUP8 1-*
 %rep %0
    times 8 db %1
 %rotate 1
 %endrep
%endmacro

div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
                 dd 420, 210, 140, 105, 105, 105, 105, 105
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
                 dw 168, 168, 140, 140, 120, 120, 105, 105
                 dw 420, 420, 210, 210, 140, 140, 105, 105
                 dw 105, 105, 105, 105, 105, 105, 105, 105
const shufw_6543210x, \
                 db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_8:       times 8 dw 8
pw_128:     times 8 dw 128
pw_256:     times 8 dw 256
pw_2048:    times 8 dw 2048
pw_0x7FFF:  times 8 dw 0x7FFF
pw_0x8000:  times 8 dw 0x8000
tap_table: ; masks for 8-bit shift emulation
           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
           ; weights
           DUP8 4, 2, 3, 3, 2, 1
           ; tap indices
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
           db  1 * 16 + 0,  2 * 16 + 0
           db  1 * 16 + 0,  2 * 16 - 1
           ; the last 6 are repeats of the first 6 so we don't need to & 7
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1

SECTION .text

%macro movif32 2
 %if ARCH_X86_32
    mov %1, %2
 %endif
%endmacro

%macro PMOVZXBW 2-3 0 ; %3 = half
 %if cpuflag(sse4) && %3 == 0
    pmovzxbw %1, %2
 %else
  %if %3 == 1
    movd %1, %2
  %else
    movq %1, %2
  %endif
    punpcklbw %1, m7
 %endif
%endmacro

%macro PSHUFB_0 2
 %if cpuflag(ssse3)
    pshufb %1, %2
 %else
    punpcklbw %1, %1
    pshuflw %1, %1, q0000
    punpcklqdq %1, %1
 %endif
%endmacro

%macro MOVDDUP 2
 %if cpuflag(ssse3)
    movddup %1, %2
 %else
    movq %1, %2
    punpcklqdq %1, %1
 %endif
%endmacro
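
; ACCUMULATE_TAP below folds one pair of filter taps into the running sum:
; sum += tap * constrain(p - px, strength, shift). As an illustrative C
; sketch of the per-pixel math it implements (the MIN/MAX helpers and the
; function name are ours, not dav1d's), with shift precomputed by the
; caller as max(0, damping - log2(strength)):
;
;   #define MIN(a, b) ((a) < (b) ? (a) : (b))
;   #define MAX(a, b) ((a) > (b) ? (a) : (b))
;   static int constrain(int diff, int strength, int shift) {
;       int adiff = abs(diff);
;       int v = MIN(adiff, MAX(0, strength - (adiff >> shift)));
;       return diff < 0 ? -v : v;
;   }
;
; There is no byte-granular shift in SSE, so "adiff >> shift" is emulated
; with a 16-bit psrlw after pre-masking each byte with tap_table's
; ((0xFF << shift) & 0xFF) masks, which stops bits shifted out of the high
; byte of a word from leaking into its low byte.
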
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    ; load p0/p1
    movsx offq, byte [dirq+kq+%1+14*8] ; off1
 %if %6 == 4
    movq m5, [stkq+offq*2+32*0] ; p0
    movhps m5, [stkq+offq*2+32*1]
 %else
    movu m5, [stkq+offq*2+32*0] ; p0
 %endif
    neg offq ; -off1
 %if %6 == 4
    movq m6, [stkq+offq*2+32*0] ; p1
    movhps m6, [stkq+offq*2+32*1]
 %else
    movu m6, [stkq+offq*2+32*0] ; p1
 %endif
 %if %7
  %if cpuflag(sse4)
    ; out-of-bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw m7, m5
    pminuw m8, m5
    pmaxsw m7, m6
    pminuw m8, m6
  %else
    pcmpeqw m3, m14, m5
    pminsw m8, m5 ; min after p0
    pandn m3, m5
    pmaxsw m7, m3 ; max after p0
    pcmpeqw m3, m14, m6
    pminsw m8, m6 ; min after p1
    pandn m3, m6
    pmaxsw m7, m3 ; max after p1
  %endif
 %endif

    ; accumulate sum[m0] over p0/p1
    psubw m5, m4 ; diff_p0(p0 - px)
    psubw m6, m4 ; diff_p1(p1 - px)
    packsswb m5, m6 ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
    pshufb m5, m13 ; group diffs p0 and p1 into pairs
    pabsb m6, m5
    psignb m3, %5, m5
 %else
    movlhps m6, m5
    punpckhbw m6, m5
    pxor m5, m5
    pcmpgtb m5, m6
    paddb m6, m5
    pxor m6, m5
    paddb m3, %5, m5
    pxor m3, m5
 %endif
    pand m9, %3, m6 ; emulate 8-bit shift
    psrlw m9, %2
    psubusb m5, %4, m9
    pminub m5, m6 ; constrain(diff_p)
 %if cpuflag(ssse3)
    pmaddubsw m5, m3 ; constrain(diff_p) * taps
 %else
    psrlw m9, m5, 8
    psraw m6, m3, 8
    psllw m5, 8
    psllw m3, 8
    pmullw m9, m6
    pmulhw m5, m3
    paddw m5, m9
 %endif
    paddw m0, m5
%endmacro

%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
    PMOVZXBW m0, [%2+strideq*0]
    PMOVZXBW m1, [%2+strideq*1]
    PMOVZXBW m2, [%2+strideq*2]
    PMOVZXBW m3, [%2+stride3q]
    mova [%1+32*0], m0
    mova [%1+32*1], m1
    mova [%1+32*2], m2
    mova [%1+32*3], m3
 %else
    movu m0, [%2+strideq*0]
    movu m1, [%2+strideq*1]
    movu m2, [%2+strideq*2]
    movu m3, [%2+stride3q]
    punpcklbw m4, m0, m7
    punpckhbw m0, m7
    mova [%1+32*0+ 0], m4
    mova [%1+32*0+16], m0
    punpcklbw m4, m1, m7
    punpckhbw m1, m7
    mova [%1+32*1+ 0], m4
    mova [%1+32*1+16], m1
    punpcklbw m4, m2, m7
    punpckhbw m2, m7
    mova [%1+32*2+ 0], m4
    mova [%1+32*2+16], m2
    punpcklbw m4, m3, m7
    punpckhbw m3, m7
    mova [%1+32*3+ 0], m4
    mova [%1+32*3+16], m3
 %endif
%endmacro

%macro CDEF_FILTER_END 2 ; w, minmax
    pxor m6, m6
    pcmpgtw m6, m0
    paddw m0, m6
 %if cpuflag(ssse3)
    pmulhrsw m0, m15
 %else
    paddw m0, m15
    psraw m0, 4
 %endif
    paddw m4, m0
 %if %2
    pminsw m4, m7
    pmaxsw m4, m8
 %endif
    packuswb m4, m4
 %if %1 == 4
    movd [dstq+strideq*0], m4
    psrlq m4, 32
    movd [dstq+strideq*1], m4
    add stkq, 32*2
    lea dstq, [dstq+strideq*2]
 %else
    movq [dstq], m4
    add stkq, 32
    add dstq, strideq
 %endif
%endmacro

%macro CDEF_FILTER 2 ; w, h
 %if ARCH_X86_64
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \
                                dst, stride, left, top, pri, sec, edge, stride3, dst4
 %define px rsp+3*16+2*32
 %define base 0
 %else
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
                                dst, stride, left, edge, stride3
 %define topq r2
 %define dst4q r2
    LEA r5, tap_table
 %define px esp+7*16+2*32
 %define base r5-tap_table
 %endif
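
; A note on the layout used below: the block is filtered from a padded
; 16-bit copy on the stack. Rows are 32 bytes apart, px points at the
; first row of the block proper, and there are 2 padding rows above and
; below plus 2 padding columns on each side (left border at byte offset
; -4, right border at byte offset %1*2). Padding with no source pixels is
; filled with OUT_OF_BOUNDS_MEM so it can be rejected by the min/max
; tracking in ACCUMULATE_TAP.
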
    mov edged, r8m
 %if cpuflag(sse4)
  %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
 %else
  %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
 %endif
    mova m6, OUT_OF_BOUNDS_MEM
    pxor m7, m7

    ; prepare pixel buffers - body/right
 %if %2 == 8
    lea dst4q, [dstq+strideq*4]
 %endif
    lea stride3q, [strideq*3]
    test edgeb, 2 ; have_right
    jz .no_right
    LOAD_BODY px, dstq, %1
 %if %2 == 8
    LOAD_BODY px+4*32, dst4q, %1
 %endif
    jmp .body_done
.no_right:
    PMOVZXBW m0, [dstq+strideq*0], %1 == 4
    PMOVZXBW m1, [dstq+strideq*1], %1 == 4
    PMOVZXBW m2, [dstq+strideq*2], %1 == 4
    PMOVZXBW m3, [dstq+stride3q ], %1 == 4
    mova [px+32*0], m0
    mova [px+32*1], m1
    mova [px+32*2], m2
    mova [px+32*3], m3
    movd [px+32*0+%1*2], m6
    movd [px+32*1+%1*2], m6
    movd [px+32*2+%1*2], m6
    movd [px+32*3+%1*2], m6
 %if %2 == 8
    PMOVZXBW m0, [dst4q+strideq*0], %1 == 4
    PMOVZXBW m1, [dst4q+strideq*1], %1 == 4
    PMOVZXBW m2, [dst4q+strideq*2], %1 == 4
    PMOVZXBW m3, [dst4q+stride3q ], %1 == 4
    mova [px+32*4], m0
    mova [px+32*5], m1
    mova [px+32*6], m2
    mova [px+32*7], m3
    movd [px+32*4+%1*2], m6
    movd [px+32*5+%1*2], m6
    movd [px+32*6+%1*2], m6
    movd [px+32*7+%1*2], m6
 %endif
.body_done:

    ; top
    movifnidn topq, r3mp
    test edgeb, 4 ; have_top
    jz .no_top
    test edgeb, 1 ; have_left
    jz .top_no_left
    test edgeb, 2 ; have_right
    jz .top_no_right
 %if %1 == 4
    PMOVZXBW m0, [topq+strideq*0-2]
    PMOVZXBW m1, [topq+strideq*1-2]
 %else
    movu m0, [topq+strideq*0-4]
    movu m1, [topq+strideq*1-4]
    punpckhbw m2, m0, m7
    punpcklbw m0, m7
    punpckhbw m3, m1, m7
    punpcklbw m1, m7
    movu [px-32*2+8], m2
    movu [px-32*1+8], m3
 %endif
    movu [px-32*2-%1], m0
    movu [px-32*1-%1], m1
    jmp .top_done
.top_no_right:
 %if %1 == 4
    PMOVZXBW m0, [topq+strideq*0-%1]
    PMOVZXBW m1, [topq+strideq*1-%1]
    movu [px-32*2-8], m0
    movu [px-32*1-8], m1
 %else
    movu m0, [topq+strideq*0-%1]
    movu m1, [topq+strideq*1-%1]
    punpckhbw m2, m0, m7
    punpcklbw m0, m7
    punpckhbw m3, m1, m7
    punpcklbw m1, m7
    mova [px-32*2-16], m0
    mova [px-32*2+ 0], m2
    mova [px-32*1-16], m1
    mova [px-32*1+ 0], m3
 %endif
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.top_no_left:
    test edgeb, 2 ; have_right
    jz .top_no_left_right
 %if %1 == 4
    PMOVZXBW m0, [topq+strideq*0]
    PMOVZXBW m1, [topq+strideq*1]
 %else
    movu m0, [topq+strideq*0]
    movu m1, [topq+strideq*1]
    punpckhbw m2, m0, m7
    punpcklbw m0, m7
    punpckhbw m3, m1, m7
    punpcklbw m1, m7
    movd [px-32*2+16], m2
    movd [px-32*1+16], m3
 %endif
    movd [px-32*2- 4], m6
    movd [px-32*1- 4], m6
    mova [px-32*2+ 0], m0
    mova [px-32*1+ 0], m1
    jmp .top_done
.top_no_left_right:
    PMOVZXBW m0, [topq+strideq*0], %1 == 4
    PMOVZXBW m1, [topq+strideq*1], %1 == 4
    movd [px-32*2-4], m6
    movd [px-32*1-4], m6
    mova [px-32*2+0], m0
    mova [px-32*1+0], m1
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.no_top:
    movu [px-32*2- 4], m6
    movu [px-32*1- 4], m6
 %if %1 == 8
    movq [px-32*2+12], m6
    movq [px-32*1+12], m6
 %endif
.top_done:

    ; left
    test edgeb, 1 ; have_left
    jz .no_left
    movifnidn leftq, leftmp
 %if %2 == 4
    movq m0, [leftq]
    punpcklbw m0, m7
 %else
    movu m0, [leftq]
    punpckhbw m1, m0, m7
    punpcklbw m0, m7
    movhlps m3, m1
    movd [px+32*4-4], m1
    movd [px+32*6-4], m3
    psrlq m1, 32
    psrlq m3, 32
    movd [px+32*5-4], m1
    movd [px+32*7-4], m3
 %endif
    movhlps m2, m0
    movd [px+32*0-4], m0
    movd [px+32*2-4], m2
    psrlq m0, 32
    psrlq m2, 32
    movd [px+32*1-4], m0
    movd [px+32*3-4], m2
    jmp .left_done
.no_left:
    movd [px+32*0-4], m6
    movd [px+32*1-4], m6
    movd [px+32*2-4], m6
    movd [px+32*3-4], m6
 %if %2 == 8
    movd [px+32*4-4], m6
    movd [px+32*5-4], m6
    movd [px+32*6-4], m6
    movd [px+32*7-4], m6
 %endif
.left_done:

    ; bottom
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3
 %else
    DEFINE_ARGS dst, stride, dst8, edge, stride3
 %endif
    test edgeb, 8 ; have_bottom
    jz .no_bottom
    lea dst8q, [dstq+%2*strideq]
    test edgeb, 1 ; have_left
    jz .bottom_no_left
    test edgeb, 2 ; have_right
    jz .bottom_no_right
 %if %1 == 4
    PMOVZXBW m0, [dst8q-(%1/2)]
    PMOVZXBW m1, [dst8q+strideq-(%1/2)]
 %else
    movu m0, [dst8q-4]
    movu m1, [dst8q+strideq-4]
    punpckhbw m2, m0, m7
    punpcklbw m0, m7
    punpckhbw m3, m1, m7
    punpcklbw m1, m7
    movu [px+32*(%2+0)+8], m2
    movu [px+32*(%2+1)+8], m3
 %endif
    movu [px+32*(%2+0)-%1], m0
    movu [px+32*(%2+1)-%1], m1
    jmp .bottom_done
.bottom_no_right:
 %if %1 == 4
    PMOVZXBW m0, [dst8q-4]
    PMOVZXBW m1, [dst8q+strideq-4]
    movu [px+32*(%2+0)-8], m0
    movu [px+32*(%2+1)-8], m1
 %else
    movu m0, [dst8q-8]
    movu m1, [dst8q+strideq-8]
    punpckhbw m2, m0, m7
    punpcklbw m0, m7
    punpckhbw m3, m1, m7
    punpcklbw m1, m7
    mova [px+32*(%2+0)-16], m0
    mova [px+32*(%2+0)+ 0], m2
    mova [px+32*(%2+1)-16], m1
    mova [px+32*(%2+1)+ 0], m3
    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
 %endif
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    jmp .bottom_done
.bottom_no_left:
    test edgeb, 2 ; have_right
    jz .bottom_no_left_right
 %if %1 == 4
    PMOVZXBW m0, [dst8q]
    PMOVZXBW m1, [dst8q+strideq]
 %else
    movu m0, [dst8q]
    movu m1, [dst8q+strideq]
    punpckhbw m2, m0, m7
    punpcklbw m0, m7
    punpckhbw m3, m1, m7
    punpcklbw m1, m7
    mova [px+32*(%2+0)+16], m2
    mova [px+32*(%2+1)+16], m3
 %endif
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.bottom_no_left_right:
    PMOVZXBW m0, [dst8q+strideq*0], %1 == 4
    PMOVZXBW m1, [dst8q+strideq*1], %1 == 4
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.no_bottom:
    movu [px+32*(%2+0)- 4], m6
    movu [px+32*(%2+1)- 4], m6
 %if %1 == 8
    movq [px+32*(%2+0)+12], m6
    movq [px+32*(%2+1)+12], m6
 %endif
.bottom_done:

    ; actual filter
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec
    mova m13, [shufb_lohi]
  %if cpuflag(ssse3)
    mova m15, [pw_2048]
  %else
    mova m15, [pw_8]
  %endif
    mova m14, m6
 %else
    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
  %xdefine m8 m1
  %xdefine m9 m2
  %xdefine m10 m0
  %xdefine m13 [base+shufb_lohi]
  %xdefine m14 OUT_OF_BOUNDS_MEM
  %if cpuflag(ssse3)
   %xdefine m15 [base+pw_2048]
  %else
   %xdefine m15 [base+pw_8]
  %endif
 %endif
    movifnidn prid, r4m
    movifnidn secd, r5m
    mov dampingd, r7m
    movif32 [esp+0x3C], r1d
    test prid, prid
    jz .sec_only
    movd m1, prim
    bsr pridmpd, prid
    test secd, secd
    jz .pri_only
    movd m10, r5m
    bsr secd, secd
    and prid, 1
    sub pridmpd, dampingd
    sub secd, dampingd
    xor dampingd, dampingd
    add prid, prid
    neg pridmpd
    cmovs pridmpd, dampingd
    neg secd
    cmovs secd, dampingd
    PSHUFB_0 m1, m7
    PSHUFB_0 m10, m7
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, pridmp, tap, pri, sec
    lea tapq, [tap_table]
    MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask
    MOVDDUP m12, [tapq+secq*8]    ; sec_shift_mask
    mov [rsp+0x00], pridmpq ; pri_shift
    mov [rsp+0x10], secq    ; sec_shift
    DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h
 %else
    MOVDDUP m2, [tapq+pridmpq*8]
    MOVDDUP m3, [tapq+secq*8]
    mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw
    mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP
    mov [esp+0x00], pridmpd
    mov [esp+0x30], secd
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
  %define offq dstq
  %define kd strided
  %define kq strideq
    mova [esp+0x10], m2
    mova [esp+0x40], m3
    mova [esp+0x20], m1
    mova [esp+0x50], m10
 %endif
    mov dird, r6m
    lea stkq, [px]
    lea priq, [tapq+8*8+priq*8] ; pri_taps
    mov hd, %1*%2/8
    lea dirq, [tapq+dirq*2]
.v_loop:
    movif32 [esp+0x38], dstd
    mov kd, 1
 %if %1 == 4
    movq m4, [stkq+32*0]
    movhps m4, [stkq+32*1]
 %else
    mova m4, [stkq+32*0] ; px
 %endif
    pxor m0, m0 ; sum
    mova m7, m4 ; max
    mova m8, m4 ; min
.k_loop:
    MOVDDUP m2, [priq+kq*8]
 %if ARCH_X86_64
    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
    MOVDDUP m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
 %else
    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
    MOVDDUP m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
    MOVDDUP m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
 %endif
    dec kd
    jge .k_loop
    movif32 dstq, [esp+0x38]
    movif32 strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 1
    dec hd
    jg .v_loop
    RET

.pri_only:
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero
    lea tapq, [tap_table]
 %else
    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
 %endif
    and prid, 1
    xor zerod, zerod
    sub dampingd, pridmpd
    cmovs dampingd, zerod
    add prid, prid
    PSHUFB_0 m1, m7
    MOVDDUP m7, [tapq+dampingq*8]
    mov [rsp+0x00], dampingq
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h
 %else
    mov [rsp+0x04], zerod
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
 %endif
    mov dird, r6m
    lea stkq, [px]
    lea priq, [tapq+8*8+priq*8]
    mov hd, %1*%2/8
    lea dirq, [tapq+dirq*2]
.pri_v_loop:
    movif32 [esp+0x38], dstd
    mov kd, 1
 %if %1 == 4
    movq m4, [stkq+32*0]
    movhps m4, [stkq+32*1]
 %else
    mova m4, [stkq+32*0]
 %endif
    pxor m0, m0
.pri_k_loop:
    MOVDDUP m2, [priq+kq*8]
    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
    dec kd
    jge .pri_k_loop
    movif32 dstq, [esp+0x38]
    movif32 strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec hd
    jg .pri_v_loop
    RET

.sec_only:
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero
 %else
    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
 %endif
    movd m1, r5m
    bsr secd, secd
    mov dird, r6m
    xor zerod, zerod
    sub dampingd, secd
    cmovs dampingd, zerod
    PSHUFB_0 m1, m7
 %if ARCH_X86_64
    lea tapq, [tap_table]
 %else
    mov [rsp+0x04], zerod
 %endif
    mov [rsp+0x00], dampingq
    MOVDDUP m7, [tapq+dampingq*8]
    lea dirq, [tapq+dirq*2]
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h
 %else
    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
 %endif
    lea stkq, [px]
    mov hd, %1*%2/8
.sec_v_loop:
    mov kd, 1
 %if %1 == 4
    movq m4, [stkq+32*0]
    movhps m4, [stkq+32*1]
 %else
    mova m4, [stkq+32*0]
 %endif
    pxor m0, m0
.sec_k_loop:
    MOVDDUP m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
 %if ARCH_X86_32
    MOVDDUP m2, [tapq+12*8+kq*8]
 %endif
    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
    dec kd
    jge .sec_k_loop
    movif32 strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec hd
    jg .sec_v_loop
    RET
%endmacro

%macro MULLD 2
 %if cpuflag(sse4)
    pmulld %1, %2
 %else
  %if ARCH_X86_32
   %define m15 m1
  %endif
    pmulhuw m15, %1, %2
    pmullw %1, %2
    pslld m15, 16
    paddd %1, m15
 %endif
%endmacro

%macro CDEF_DIR 0
 %if ARCH_X86_64
cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
    lea r6, [strideq*3]
    movq m1, [srcq+strideq*0]
    movhps m1, [srcq+strideq*1]
    movq m3, [srcq+strideq*2]
    movhps m3, [srcq+r6]
    lea srcq, [srcq+strideq*4]
    movq m5, [srcq+strideq*0]
    movhps m5, [srcq+strideq*1]
    movq m7, [srcq+strideq*2]
    movhps m7, [srcq+r6]

    pxor m8, m8
    psadbw m9, m1, m8
    psadbw m2, m3, m8
    psadbw m4, m5, m8
    psadbw m6, m7, m8
    packssdw m9, m2
    packssdw m4, m6
    packssdw m9, m4

    punpcklbw m0, m1, m8
    punpckhbw m1, m8
    punpcklbw m2, m3, m8
    punpckhbw m3, m8
    punpcklbw m4, m5, m8
    punpckhbw m5, m8
    punpcklbw m6, m7, m8
    punpckhbw m7, m8
cglobal_label .main
    mova m8, [pw_128]
    psubw m0, m8
    psubw m1, m8
    psubw m2, m8
    psubw m3, m8
    psubw m4, m8
    psubw m5, m8
    psubw m6, m8
    psubw m7, m8
    psllw m8, 3
    psubw m9, m8 ; partial_sum_hv[0]

    paddw m8, m0, m1
    paddw m10, m2, m3
    paddw m8, m4
    paddw m10, m5
    paddw m8, m6
    paddw m10, m7
    paddw m8, m10 ; partial_sum_hv[1]

    pmaddwd m8, m8
    pmaddwd m9, m9
    phaddd m9, m8
    SWAP m8, m9
    MULLD m8, [div_table%+SUFFIX+48]

    pslldq m9, m1, 2
    psrldq m10, m1, 14
    pslldq m11, m2, 4
    psrldq m12, m2, 12
    pslldq m13, m3, 6
    psrldq m14, m3, 10
    paddw m9, m0
    paddw m10, m12
    paddw m11, m13
    paddw m10, m14 ; partial_sum_diag[0] top/right half
    paddw m9, m11 ; partial_sum_diag[0] top/left half
    pslldq m11, m4, 8
    psrldq m12, m4, 8
    pslldq m13, m5, 10
    psrldq m14, m5, 6
    paddw m9, m11
    paddw m10, m12
    paddw m9, m13
    paddw m10, m14
    pslldq m11, m6, 12
    psrldq m12, m6, 4
    pslldq m13, m7, 14
    psrldq m14, m7, 2
    paddw m9, m11
    paddw m10, m12
    paddw m9, m13 ; partial_sum_diag[0][0-7]
    paddw m10, m14 ; partial_sum_diag[0][8-14,zero]
    pshufb m10, [shufw_6543210x]
    punpckhwd m11, m9, m10
    punpcklwd m9, m10
    pmaddwd m11, m11
    pmaddwd m9, m9
    MULLD m11, [div_table%+SUFFIX+16]
    MULLD m9, [div_table%+SUFFIX+0]
    paddd m9, m11 ; cost[0a-d]

    pslldq m10, m0, 14
    psrldq m11, m0, 2
    pslldq m12, m1, 12
    psrldq m13, m1, 4
    pslldq m14, m2, 10
    psrldq m15, m2, 6
    paddw m10, m12
    paddw m11, m13
    paddw m10, m14
    paddw m11, m15
    pslldq m12, m3, 8
    psrldq m13, m3, 8
    pslldq m14, m4, 6
    psrldq m15, m4, 10
    paddw m10, m12
    paddw m11, m13
    paddw m10, m14
    paddw m11, m15
    pslldq m12, m5, 4
    psrldq m13, m5, 12
    pslldq m14, m6, 2
    psrldq m15, m6, 14
    paddw m10, m12
    paddw m11, m13
    paddw m10, m14
    paddw m11, m15 ; partial_sum_diag[1][8-14,zero]
    paddw m10, m7 ; partial_sum_diag[1][0-7]
    pshufb m11, [shufw_6543210x]
    punpckhwd m12, m10, m11
    punpcklwd m10, m11
    pmaddwd m12, m12
    pmaddwd m10, m10
    MULLD m12, [div_table%+SUFFIX+16]
    MULLD m10, [div_table%+SUFFIX+0]
    paddd m10, m12 ; cost[4a-d]
    phaddd m9, m10 ; cost[0a/b,4a/b]

    paddw m10, m0, m1
    paddw m11, m2, m3
    paddw m12, m4, m5
    paddw m13, m6, m7
    phaddw m0, m4
    phaddw m1, m5
    phaddw m2, m6
    phaddw m3, m7

    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
    pslldq m4, m11, 2
    psrldq m5, m11, 14
    pslldq m6, m12, 4
    psrldq m7, m12, 12
    pslldq m14, m13, 6
    psrldq m15, m13, 10
    paddw m4, m10
    paddw m5, m7
    paddw m4, m6
    paddw m5, m15 ; partial_sum_alt[3] right
    paddw m4, m14 ; partial_sum_alt[3] left
    pshuflw m6, m5, q3012
    punpckhwd m5, m4
    punpcklwd m4, m6
    pmaddwd m5, m5
    pmaddwd m4, m4
    MULLD m5, [div_table%+SUFFIX+48]
    MULLD m4, [div_table%+SUFFIX+32]
    paddd m4, m5 ; cost[7a-d]

    pslldq m5, m10, 6
    psrldq m6, m10, 10
    pslldq m7, m11, 4
    psrldq m10, m11, 12
    pslldq m11, m12, 2
    psrldq m12, 14
    paddw m5, m7
    paddw m6, m10
    paddw m5, m11
    paddw m6, m12
    paddw m5, m13
    pshuflw m7, m6, q3012
    punpckhwd m6, m5
    punpcklwd m5, m7
    pmaddwd m6, m6
    pmaddwd m5, m5
    MULLD m6, [div_table%+SUFFIX+48]
    MULLD m5, [div_table%+SUFFIX+32]
    paddd m5, m6 ; cost[5a-d]

    pslldq m6, m1, 2
    psrldq m7, m1, 14
    pslldq m10, m2, 4
    psrldq m11, m2, 12
    pslldq m12, m3, 6
    psrldq m13, m3, 10
    paddw m6, m0
    paddw m7, m11
    paddw m6, m10
    paddw m7, m13 ; partial_sum_alt[3] right
    paddw m6, m12 ; partial_sum_alt[3] left
    pshuflw m10, m7, q3012
    punpckhwd m7, m6
    punpcklwd m6, m10
    pmaddwd m7, m7
    pmaddwd m6, m6
    MULLD m7, [div_table%+SUFFIX+48]
    MULLD m6, [div_table%+SUFFIX+32]
    paddd m6, m7 ; cost[1a-d]

    pshufd m0, m0, q1032
    pshufd m1, m1, q1032
    pshufd m2, m2, q1032
    pshufd m3, m3, q1032

    pslldq m10, m0, 6
    psrldq m11, m0, 10
    pslldq m12, m1, 4
    psrldq m13, m1, 12
    pslldq m14, m2, 2
    psrldq m2, 14
    paddw m10, m12
    paddw m11, m13
    paddw m10, m14
    paddw m11, m2
    paddw m10, m3
    pshuflw m12, m11, q3012
    punpckhwd m11, m10
    punpcklwd m10, m12
    pmaddwd m11, m11
    pmaddwd m10, m10
    MULLD m11, [div_table%+SUFFIX+48]
    MULLD m10, [div_table%+SUFFIX+32]
    paddd m10, m11 ; cost[3a-d]

    phaddd m9, m8 ; cost[0,4,2,6]
    phaddd m6, m10
    phaddd m5, m4
    phaddd m6, m5 ; cost[1,3,5,7]
    pshufd m4, m9, q3120

    ; now find the best cost
 %if cpuflag(sse4)
    pmaxsd m9, m6
    pshufd m0, m9, q1032
    pmaxsd m0, m9
    pshufd m1, m0, q2301
    pmaxsd m0, m1 ; best cost
 %else
    pcmpgtd m0, m9, m6
    pand m9, m0
    pandn m0, m6
    por m9, m0
    pshufd m1, m9, q1032
    pcmpgtd m0, m9, m1
    pand m9, m0
    pandn m0, m1
    por m9, m0
    pshufd m1, m9, q2301
    pcmpgtd m0, m9, m1
    pand m9, m0
    pandn m0, m1
    por m0, m9
 %endif

    ; get direction and variance
    punpckhdq m1, m4, m6
    punpckldq m4, m6
    psubd m2, m0, m1
    psubd m3, m0, m4
 %if WIN64
    WIN64_RESTORE_XMM
  %define tmp rsp+stack_offset+8
 %else
  %define tmp rsp-40
 %endif
    mova [tmp+0x00], m2 ; emulate ymm in stack
    mova [tmp+0x10], m3
    pcmpeqd m1, m0 ; compute best cost mask
    pcmpeqd m4, m0
    packssdw m4, m1
    pmovmskb eax, m4 ; get byte-idx from mask
    tzcnt eax, eax
    mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm
    shr eax, 1 ; get direction by converting byte-idx to word-idx
    shr r1d, 10
    mov [varq], r1d
 %else
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
 %define base r2-shufw_6543210x
    LEA r2, shufw_6543210x
    pxor m0, m0
    lea stride3q, [strideq*3]
    movq m5, [srcq+strideq*0]
    movhps m5, [srcq+strideq*1]
    movq m7, [srcq+strideq*2]
    movhps m7, [srcq+stride3q]
    mova m1, [base+pw_128]
    psadbw m2, m5, m0
    psadbw m3, m7, m0
    packssdw m2, m3
    punpcklbw m4, m5, m0
    punpckhbw m5, m0
    punpcklbw m6, m7, m0
    punpckhbw m7, m0
    psubw m4, m1
    psubw m5, m1
    psubw m6, m1
    psubw m7, m1

    mova [esp+0x00], m4
    mova [esp+0x10], m5
    mova [esp+0x20], m6
    mova [esp+0x50], m7

    lea srcq, [srcq+strideq*4]
    movq m5, [srcq+strideq*0]
    movhps m5, [srcq+strideq*1]
    movq m7, [srcq+strideq*2]
    movhps m7, [srcq+stride3q]
    psadbw m3, m5, m0
    psadbw m0, m7
    packssdw m3, m0
    pxor m0, m0
    punpcklbw m4, m5, m0
    punpckhbw m5, m0
    punpcklbw m6, m7, m0
    punpckhbw m7, m0
cglobal_label .main
    psubw m4, m1
    psubw m5, m1
    psubw m6, m1
    psubw m7, m1
    packssdw m2, m3
    psllw m1, 3
    psubw m2, m1 ; partial_sum_hv[0]
    pmaddwd m2, m2

    mova m3, [esp+0x50]
    mova m0, [esp+0x00]
    paddw m0, [esp+0x10]
    paddw m1, m3, [esp+0x20]
    paddw m0, m4
    paddw m1, m5
    paddw m0, m6
    paddw m1, m7
    paddw m0, m1 ; partial_sum_hv[1]
    pmaddwd m0, m0

    phaddd m2, m0
    MULLD m2, [base+div_table%+SUFFIX+48]
    mova [esp+0x30], m2

    mova m1, [esp+0x10]
    pslldq m0, m1, 2
    psrldq m1, 14
    paddw m0, [esp+0x00]
    pslldq m2, m3, 6
    psrldq m3, 10
    paddw m0, m2
    paddw m1, m3
    mova m3, [esp+0x20]
    pslldq m2, m3, 4
    psrldq m3, 12
    paddw m0, m2 ; partial_sum_diag[0] top/left half
    paddw m1, m3 ; partial_sum_diag[0] top/right half
    pslldq m2, m4, 8
    psrldq m3, m4, 8
    paddw m0, m2
    paddw m1, m3
    pslldq m2, m5, 10
    psrldq m3, m5, 6
    paddw m0, m2
    paddw m1, m3
    pslldq m2, m6, 12
    psrldq m3, m6, 4
    paddw m0, m2
    paddw m1, m3
    pslldq m2, m7, 14
    psrldq m3, m7, 2
    paddw m0, m2 ; partial_sum_diag[0][0-7]
    paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
    mova m3, [esp+0x50]
    pshufb m1, [base+shufw_6543210x]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    pmaddwd m2, m2
    pmaddwd m0, m0
    MULLD m2, [base+div_table%+SUFFIX+16]
    MULLD m0, [base+div_table%+SUFFIX+ 0]
    paddd m0, m2 ; cost[0a-d]
    mova [esp+0x40], m0

    mova m1, [esp+0x00]
    pslldq m0, m1, 14
    psrldq m1, 2
    paddw m0, m7
    pslldq m2, m3, 8
    psrldq m3, 8
    paddw m0, m2
    paddw m1, m3
    mova m3, [esp+0x20]
    pslldq m2, m3, 10
    psrldq m3, 6
    paddw m0, m2
    paddw m1, m3
    mova m3, [esp+0x10]
    pslldq m2, m3, 12
    psrldq m3, 4
    paddw m0, m2
    paddw m1, m3
    pslldq m2, m4, 6
    psrldq m3, m4, 10
    paddw m0, m2
    paddw m1, m3
    pslldq m2, m5, 4
    psrldq m3, m5, 12
    paddw m0, m2
    paddw m1, m3
    pslldq m2, m6, 2
    psrldq m3, m6, 14
    paddw m0, m2 ; partial_sum_diag[1][0-7]
    paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
    mova m3, [esp+0x50]
    pshufb m1, [base+shufw_6543210x]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    pmaddwd m2, m2
    pmaddwd m0, m0
    MULLD m2, [base+div_table%+SUFFIX+16]
    MULLD m0, [base+div_table%+SUFFIX+ 0]
    paddd m0, m2 ; cost[4a-d]
    phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
    phaddd m1, [esp+0x30] ; cost[0,4,2,6]
    mova [esp+0x30], m1

    phaddw m0, [esp+0x00], m4
    phaddw m1, [esp+0x10], m5
    paddw m4, m5
    mova m2, [esp+0x20]
    paddw m5, m2, m3
    phaddw m2, m6
    paddw m6, m7
    phaddw m3, m7
    mova m7, [esp+0x00]
    paddw m7, [esp+0x10]
    mova [esp+0x00], m0
    mova [esp+0x10], m1
    mova [esp+0x20], m2

    pslldq m1, m4, 4
    pslldq m2, m6, 6
    pslldq m0, m5, 2
    paddw m1, m2
    paddw m0, m7
    psrldq m2, m5, 14
    paddw m0, m1 ; partial_sum_alt[3] left
    psrldq m1, m4, 12
    paddw m1, m2
    psrldq m2, m6, 10
    paddw m1, m2 ; partial_sum_alt[3] right
    pshuflw m1, m1, q3012
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    pmaddwd m2, m2
    pmaddwd m0, m0
    MULLD m2, [base+div_table%+SUFFIX+48]
    MULLD m0, [base+div_table%+SUFFIX+32]
    paddd m0, m2 ; cost[7a-d]
    mova [esp+0x40], m0

    pslldq m0, m7, 6
    psrldq m7, 10
    pslldq m1, m5, 4
    psrldq m5, 12
    pslldq m2, m4, 2
    psrldq m4, 14
    paddw m0, m6
    paddw m7, m5
    paddw m0, m1
    paddw m7, m4
    paddw m0, m2
    pshuflw m2, m7, q3012
    punpckhwd m7, m0
    punpcklwd m0, m2
    pmaddwd m7, m7
    pmaddwd m0, m0
    MULLD m7, [base+div_table%+SUFFIX+48]
    MULLD m0, [base+div_table%+SUFFIX+32]
    paddd m0, m7 ; cost[5a-d]
    mova [esp+0x50], m0

    mova m7, [esp+0x10]
    mova m2, [esp+0x20]
    pslldq m0, m7, 2
    psrldq m7, 14
    pslldq m4, m2, 4
    psrldq m2, 12
    pslldq m5, m3, 6
    psrldq m6, m3, 10
    paddw m0, [esp+0x00]
    paddw m7, m2
    paddw m4, m5
    paddw m7, m6 ; partial_sum_alt[3] right
    paddw m0, m4 ; partial_sum_alt[3] left
    pshuflw m2, m7, q3012
    punpckhwd m7, m0
    punpcklwd m0, m2
    pmaddwd m7, m7
    pmaddwd m0, m0
    MULLD m7, [base+div_table%+SUFFIX+48]
    MULLD m0, [base+div_table%+SUFFIX+32]
    paddd m0, m7 ; cost[1a-d]
    SWAP m0, m4

    pshufd m0, [esp+0x00], q1032
    pshufd m1, [esp+0x10], q1032
    pshufd m2, [esp+0x20], q1032
    pshufd m3, m3, q1032
    mova [esp+0x00], m4

    pslldq m4, m0, 6
    psrldq m0, 10
    pslldq m5, m1, 4
    psrldq m1, 12
    pslldq m6, m2, 2
    psrldq m2, 14
    paddw m4, m3
    paddw m0, m1
    paddw m5, m6
    paddw m0, m2
    paddw m4, m5
    pshuflw m2, m0, q3012
    punpckhwd m0, m4
    punpcklwd m4, m2
    pmaddwd m0, m0
    pmaddwd m4, m4
    MULLD m0, [base+div_table%+SUFFIX+48]
    MULLD m4, [base+div_table%+SUFFIX+32]
    paddd m4, m0 ; cost[3a-d]

    mova m1, [esp+0x00]
    mova m2, [esp+0x50]
    mova m0, [esp+0x30] ; cost[0,4,2,6]
    phaddd m1, m4
    phaddd m2, [esp+0x40] ; cost[1,3,5,7]
    phaddd m1, m2
    pshufd m2, m0, q3120

    ; now find the best cost
 %if cpuflag(sse4)
    pmaxsd m0, m1
    pshufd m3, m0, q1032
    pmaxsd m3, m0
    pshufd m0, m3, q2301
    pmaxsd m0, m3
 %else
    pcmpgtd m3, m0, m1
    pand m0, m3
    pandn m3, m1
    por m0, m3
    pshufd m4, m0, q1032
    pcmpgtd m3, m0, m4
    pand m0, m3
    pandn m3, m4
    por m0, m3
    pshufd m4, m0, q2301
    pcmpgtd m3, m0, m4
    pand m0, m3
    pandn m3, m4
    por m0, m3
 %endif

    ; get direction and variance
    mov vard, varm
    punpckhdq m3, m2, m1
    punpckldq m2, m1
    psubd m1, m0, m3
    psubd m4, m0, m2
    mova [esp+0x00], m1 ; emulate ymm in stack
    mova [esp+0x10], m4
    pcmpeqd m3, m0 ; compute best cost mask
    pcmpeqd m2, m0
    packssdw m2, m3
    pmovmskb eax, m2 ; get byte-idx from mask
    tzcnt eax, eax
    mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
    shr eax, 1 ; get direction by converting byte-idx to word-idx
    shr r1d, 10
    mov [vard], r1d
 %endif

    RET
%endmacro

INIT_XMM sse4
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM ssse3
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM sse2
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
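
; Note: cdef_dir is only instantiated for ssse3 and sse4 above. CDEF_DIR
; uses SSSE3 instructions (pshufb, phaddw, phaddd) unconditionally, and
; div_table only exists in ssse3/sse4 flavors, so an sse2 variant is
; presumably not worth emulating.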