; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28%include "config.asm" 29%include "ext/x86/x86inc.asm" 30 31SECTION_RODATA 32 33%macro DUP8 1-* 34 %rep %0 35 times 8 dw %1 36 %rotate 1 37 %endrep 38%endmacro 39 40pri_taps: DUP8 4, 2, 3, 3 41dir_table: db 1 * 32 + 0, 2 * 32 + 0 42 db 1 * 32 + 0, 2 * 32 - 2 43 db -1 * 32 + 2, -2 * 32 + 4 44 db 0 * 32 + 2, -1 * 32 + 4 45 db 0 * 32 + 2, 0 * 32 + 4 46 db 0 * 32 + 2, 1 * 32 + 4 47 db 1 * 32 + 2, 2 * 32 + 4 48 db 1 * 32 + 0, 2 * 32 + 2 49 db 1 * 32 + 0, 2 * 32 + 0 50 db 1 * 32 + 0, 2 * 32 - 2 51 db -1 * 32 + 2, -2 * 32 + 4 52 db 0 * 32 + 2, -1 * 32 + 4 53 54dir_shift: times 4 dw 0x4000 55 times 4 dw 0x1000 56 57pw_128: times 4 dw 128 58pw_2048: times 8 dw 2048 59pw_m16384: times 8 dw -16384 60 61cextern cdef_dir_8bpc_ssse3.main 62cextern cdef_dir_8bpc_sse4.main 63cextern shufw_6543210x 64 65SECTION .text 66 67%if ARCH_X86_32 68DECLARE_REG_TMP 5, 3 69%elif WIN64 70DECLARE_REG_TMP 8, 4 71%else 72DECLARE_REG_TMP 8, 6 73%endif 74 75%macro CDEF_FILTER 2 ; w, h 76%if ARCH_X86_64 77 DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir 78 mova m8, [base+pw_2048] 79%else 80 DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir 81 %define m8 [base+pw_2048] 82 %define m9 [rsp+16*1+gprsize] 83 %define m10 [rsp+16*2+gprsize] 84%endif 85 movifnidn prid, r5m 86 movifnidn secd, r6m 87 test prid, prid 88 jz .sec_only 89 movd m6, r5m 90%if ARCH_X86_32 91 mov [rsp+24], pridmpd 92%endif 93 bsr pridmpd, prid 94 lea tmpd, [priq*4] 95 cmp dword r10m, 0x3ff ; if (bpc == 10) 96 cmove prid, tmpd ; pri <<= 2 97 mov tmpd, r8m ; damping 98 mov dird, r7m 99 and prid, 16 100 pshufb m6, m7 ; splat 101 lea dirq, [base+dir_table+dirq*2] 102 lea priq, [base+pri_taps+priq*2] 103 test secd, secd 104 jz .pri_only 105 mova [rsp], m6 106 movd m6, secd 107 tzcnt secd, secd 108 sub pridmpd, tmpd 109 sub tmpd, secd 110 pshufb m6, m7 111 xor secd, secd 112 neg pridmpd 113 cmovs pridmpd, secd 114%if ARCH_X86_32 115 mov [pri_shift+4], secd 116 mov [sec_shift+4], secd 117%endif 118 mov [pri_shift+0], pridmpq 119 mov 
[sec_shift+0], tmpq 120 lea tmpq, [px] 121%if WIN64 122 movaps r4m, m9 123 movaps r6m, m10 124%elif ARCH_X86_32 125 mov pridmpd, [rsp+24] 126%endif 127%rep %1*%2/8 128 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec 129%endrep 130%if WIN64 131 movaps m9, r4m 132 movaps m10, r6m 133%endif 134 jmp .end 135.pri_only: 136 sub tmpd, pridmpd 137 cmovs tmpd, secd 138%if ARCH_X86_32 139 mov pridmpd, [rsp+24] 140 mov [pri_shift+4], secd 141%endif 142 mov [pri_shift+0], tmpq 143 lea tmpq, [px] 144%rep %1*%2/8 145 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri 146%endrep 147.end: 148 RET 149.sec_only: 150 mov tmpd, r8m ; damping 151 movd m6, r6m 152 tzcnt secd, secd 153 mov dird, r7m 154 pshufb m6, m7 155 sub tmpd, secd 156 lea dirq, [base+dir_table+dirq*2] 157%if ARCH_X86_32 158 mov [sec_shift+4], prid 159%endif 160 mov [sec_shift+0], tmpq 161 lea tmpq, [px] 162%rep %1*%2/8 163 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec 164%endrep 165 jmp .end 166%if %1 == %2 167 %if ARCH_X86_64 168 DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir 169 %else 170 DEFINE_ARGS dst, stride, tmp, off, pri, _, dir 171 %endif 172ALIGN function_align 173.pri: 174 movsx offq, byte [dirq+4] ; off_k0 175%if %1 == 4 176 movq m1, [dstq+strideq*0] 177 movhps m1, [dstq+strideq*1] 178 movq m2, [tmpq+offq+32*0] ; k0p0 179 movhps m2, [tmpq+offq+32*1] 180 neg offq 181 movq m3, [tmpq+offq+32*0] ; k0p1 182 movhps m3, [tmpq+offq+32*1] 183%else 184 mova m1, [dstq] 185 movu m2, [tmpq+offq] 186 neg offq 187 movu m3, [tmpq+offq] 188%endif 189 movsx offq, byte [dirq+5] ; off_k1 190 psubw m2, m1 ; diff_k0p0 191 psubw m3, m1 ; diff_k0p1 192 pabsw m4, m2 ; adiff_k0p0 193 psrlw m5, m4, [pri_shift+gprsize] 194 psubusw m0, m6, m5 195 pabsw m5, m3 ; adiff_k0p1 196 pminsw m0, m4 197 psrlw m4, m5, [pri_shift+gprsize] 198 psignw m0, m2 ; constrain(diff_k0p0) 199 psubusw m2, m6, m4 200 pminsw m2, m5 201%if %1 == 4 202 movq m4, [tmpq+offq+32*0] ; k1p0 203 
movhps m4, [tmpq+offq+32*1] 204 neg offq 205 movq m5, [tmpq+offq+32*0] ; k1p1 206 movhps m5, [tmpq+offq+32*1] 207%else 208 movu m4, [tmpq+offq] 209 neg offq 210 movu m5, [tmpq+offq] 211%endif 212 psubw m4, m1 ; diff_k1p0 213 psubw m5, m1 ; diff_k1p1 214 psignw m2, m3 ; constrain(diff_k0p1) 215 pabsw m3, m4 ; adiff_k1p0 216 paddw m0, m2 ; constrain(diff_k0) 217 psrlw m2, m3, [pri_shift+gprsize] 218 psubusw m7, m6, m2 219 pabsw m2, m5 ; adiff_k1p1 220 pminsw m7, m3 221 psrlw m3, m2, [pri_shift+gprsize] 222 psignw m7, m4 ; constrain(diff_k1p0) 223 psubusw m4, m6, m3 224 pminsw m4, m2 225 psignw m4, m5 ; constrain(diff_k1p1) 226 paddw m7, m4 ; constrain(diff_k1) 227 pmullw m0, [priq+16*0] ; pri_tap_k0 228 pmullw m7, [priq+16*1] ; pri_tap_k1 229 paddw m0, m7 ; sum 230 psraw m2, m0, 15 231 paddw m0, m2 232 pmulhrsw m0, m8 233 paddw m0, m1 234%if %1 == 4 235 add tmpq, 32*2 236 movq [dstq+strideq*0], m0 237 movhps [dstq+strideq*1], m0 238 lea dstq, [dstq+strideq*2] 239%else 240 add tmpq, 32 241 mova [dstq], m0 242 add dstq, strideq 243%endif 244 ret 245ALIGN function_align 246.sec: 247 movsx offq, byte [dirq+8] ; off1_k0 248%if %1 == 4 249 movq m1, [dstq+strideq*0] 250 movhps m1, [dstq+strideq*1] 251 movq m2, [tmpq+offq+32*0] ; k0s0 252 movhps m2, [tmpq+offq+32*1] 253 neg offq 254 movq m3, [tmpq+offq+32*0] ; k0s1 255 movhps m3, [tmpq+offq+32*1] 256%else 257 mova m1, [dstq] 258 movu m2, [tmpq+offq] 259 neg offq 260 movu m3, [tmpq+offq] 261%endif 262 movsx offq, byte [dirq+0] ; off2_k0 263 psubw m2, m1 ; diff_k0s0 264 psubw m3, m1 ; diff_k0s1 265 pabsw m4, m2 ; adiff_k0s0 266 psrlw m5, m4, [sec_shift+gprsize] 267 psubusw m0, m6, m5 268 pabsw m5, m3 ; adiff_k0s1 269 pminsw m0, m4 270 psrlw m4, m5, [sec_shift+gprsize] 271 psignw m0, m2 ; constrain(diff_k0s0) 272 psubusw m2, m6, m4 273 pminsw m2, m5 274%if %1 == 4 275 movq m4, [tmpq+offq+32*0] ; k0s2 276 movhps m4, [tmpq+offq+32*1] 277 neg offq 278 movq m5, [tmpq+offq+32*0] ; k0s3 279 movhps m5, [tmpq+offq+32*1] 280%else 281 
movu m4, [tmpq+offq] 282 neg offq 283 movu m5, [tmpq+offq] 284%endif 285 movsx offq, byte [dirq+9] ; off1_k1 286 psubw m4, m1 ; diff_k0s2 287 psubw m5, m1 ; diff_k0s3 288 psignw m2, m3 ; constrain(diff_k0s1) 289 pabsw m3, m4 ; adiff_k0s2 290 paddw m0, m2 291 psrlw m2, m3, [sec_shift+gprsize] 292 psubusw m7, m6, m2 293 pabsw m2, m5 ; adiff_k0s3 294 pminsw m7, m3 295 psrlw m3, m2, [sec_shift+gprsize] 296 psignw m7, m4 ; constrain(diff_k0s2) 297 psubusw m4, m6, m3 298 pminsw m4, m2 299%if %1 == 4 300 movq m2, [tmpq+offq+32*0] ; k1s0 301 movhps m2, [tmpq+offq+32*1] 302 neg offq 303 movq m3, [tmpq+offq+32*0] ; k1s1 304 movhps m3, [tmpq+offq+32*1] 305%else 306 movu m2, [tmpq+offq] 307 neg offq 308 movu m3, [tmpq+offq] 309%endif 310 movsx offq, byte [dirq+1] ; off2_k1 311 paddw m0, m7 312 psignw m4, m5 ; constrain(diff_k0s3) 313 paddw m0, m4 ; constrain(diff_k0) 314 psubw m2, m1 ; diff_k1s0 315 psubw m3, m1 ; diff_k1s1 316 paddw m0, m0 ; sec_tap_k0 317 pabsw m4, m2 ; adiff_k1s0 318 psrlw m5, m4, [sec_shift+gprsize] 319 psubusw m7, m6, m5 320 pabsw m5, m3 ; adiff_k1s1 321 pminsw m7, m4 322 psrlw m4, m5, [sec_shift+gprsize] 323 psignw m7, m2 ; constrain(diff_k1s0) 324 psubusw m2, m6, m4 325 pminsw m2, m5 326%if %1 == 4 327 movq m4, [tmpq+offq+32*0] ; k1s2 328 movhps m4, [tmpq+offq+32*1] 329 neg offq 330 movq m5, [tmpq+offq+32*0] ; k1s3 331 movhps m5, [tmpq+offq+32*1] 332%else 333 movu m4, [tmpq+offq] 334 neg offq 335 movu m5, [tmpq+offq] 336%endif 337 paddw m0, m7 338 psubw m4, m1 ; diff_k1s2 339 psubw m5, m1 ; diff_k1s3 340 psignw m2, m3 ; constrain(diff_k1s1) 341 pabsw m3, m4 ; adiff_k1s2 342 paddw m0, m2 343 psrlw m2, m3, [sec_shift+gprsize] 344 psubusw m7, m6, m2 345 pabsw m2, m5 ; adiff_k1s3 346 pminsw m7, m3 347 psrlw m3, m2, [sec_shift+gprsize] 348 psignw m7, m4 ; constrain(diff_k1s2) 349 psubusw m4, m6, m3 350 pminsw m4, m2 351 paddw m0, m7 352 psignw m4, m5 ; constrain(diff_k1s3) 353 paddw m0, m4 ; sum 354 psraw m2, m0, 15 355 paddw m0, m2 356 pmulhrsw m0, m8 357 
paddw m0, m1 358%if %1 == 4 359 add tmpq, 32*2 360 movq [dstq+strideq*0], m0 361 movhps [dstq+strideq*1], m0 362 lea dstq, [dstq+strideq*2] 363%else 364 add tmpq, 32 365 mova [dstq], m0 366 add dstq, strideq 367%endif 368 ret 369ALIGN function_align 370.pri_sec: 371 movsx offq, byte [dirq+8] ; off2_k0 372%if %1 == 4 373 movq m1, [dstq+strideq*0] 374 movhps m1, [dstq+strideq*1] 375 movq m2, [tmpq+offq+32*0] ; k0s0 376 movhps m2, [tmpq+offq+32*1] 377 neg offq 378 movq m3, [tmpq+offq+32*0] ; k0s1 379 movhps m3, [tmpq+offq+32*1] 380%else 381 mova m1, [dstq] 382 movu m2, [tmpq+offq] 383 neg offq 384 movu m3, [tmpq+offq] 385%endif 386 movsx offq, byte [dirq+0] ; off3_k0 387 pabsw m4, m2 388%if ARCH_X86_64 389 pabsw m10, m3 390 pmaxsw m9, m2, m3 391 pminsw m10, m4 392%else 393 pabsw m7, m3 394 pmaxsw m5, m2, m3 395 pminsw m4, m7 396 mova m9, m5 397 mova m10, m4 398%endif 399 psubw m2, m1 ; diff_k0s0 400 psubw m3, m1 ; diff_k0s1 401 pabsw m4, m2 ; adiff_k0s0 402 psrlw m5, m4, [sec_shift+gprsize] 403 psubusw m0, m6, m5 404 pabsw m5, m3 ; adiff_k0s1 405 pminsw m0, m4 406 psrlw m4, m5, [sec_shift+gprsize] 407 psignw m0, m2 ; constrain(diff_k0s0) 408 psubusw m2, m6, m4 409 pminsw m2, m5 410%if %1 == 4 411 movq m4, [tmpq+offq+32*0] ; k0s2 412 movhps m4, [tmpq+offq+32*1] 413 neg offq 414 movq m5, [tmpq+offq+32*0] ; k0s3 415 movhps m5, [tmpq+offq+32*1] 416%else 417 movu m4, [tmpq+offq] 418 neg offq 419 movu m5, [tmpq+offq] 420%endif 421 movsx offq, byte [dirq+9] ; off2_k1 422 pabsw m7, m4 423 psignw m2, m3 424 pabsw m3, m5 ; constrain(diff_k0s1) 425%if ARCH_X86_64 426 pmaxsw m9, m4 427 pminsw m10, m7 428 pmaxsw m9, m5 429 pminsw m10, m3 430%else 431 pminsw m7, m10 432 pminsw m7, m3 433 pmaxsw m3, m9, m4 434 pmaxsw m3, m5 435 mova m10, m7 436 mova m9, m3 437%endif 438 psubw m4, m1 ; diff_k0s2 439 psubw m5, m1 ; diff_k0s3 440 paddw m0, m2 441 pabsw m3, m4 ; adiff_k0s2 442 psrlw m2, m3, [sec_shift+gprsize] 443 psubusw m7, m6, m2 444 pabsw m2, m5 ; adiff_k0s3 445 pminsw m7, m3 446 
psrlw m3, m2, [sec_shift+gprsize] 447 psignw m7, m4 ; constrain(diff_k0s2) 448 psubusw m4, m6, m3 449 pminsw m4, m2 450%if %1 == 4 451 movq m2, [tmpq+offq+32*0] ; k1s0 452 movhps m2, [tmpq+offq+32*1] 453 neg offq 454 movq m3, [tmpq+offq+32*0] ; k1s1 455 movhps m3, [tmpq+offq+32*1] 456%else 457 movu m2, [tmpq+offq] 458 neg offq 459 movu m3, [tmpq+offq] 460%endif 461 movsx offq, byte [dirq+1] ; off3_k1 462 paddw m0, m7 463 pabsw m7, m2 464 psignw m4, m5 ; constrain(diff_k0s3) 465 pabsw m5, m3 466%if ARCH_X86_64 467 pmaxsw m9, m2 468 pminsw m10, m7 469 pmaxsw m9, m3 470 pminsw m10, m5 471%else 472 pminsw m7, m10 473 pminsw m7, m5 474 pmaxsw m5, m9, m2 475 pmaxsw m5, m3 476 mova m10, m7 477 mova m9, m5 478%endif 479 paddw m0, m4 ; constrain(diff_k0) 480 psubw m2, m1 ; diff_k1s0 481 psubw m3, m1 ; diff_k1s1 482 paddw m0, m0 ; sec_tap_k0 483 pabsw m4, m2 ; adiff_k1s0 484 psrlw m5, m4, [sec_shift+gprsize] 485 psubusw m7, m6, m5 486 pabsw m5, m3 ; adiff_k1s1 487 pminsw m7, m4 488 psrlw m4, m5, [sec_shift+gprsize] 489 psignw m7, m2 ; constrain(diff_k1s0) 490 psubusw m2, m6, m4 491 pminsw m2, m5 492%if %1 == 4 493 movq m4, [tmpq+offq+32*0] ; k1s2 494 movhps m4, [tmpq+offq+32*1] 495 neg offq 496 movq m5, [tmpq+offq+32*0] ; k1s3 497 movhps m5, [tmpq+offq+32*1] 498%else 499 movu m4, [tmpq+offq] 500 neg offq 501 movu m5, [tmpq+offq] 502%endif 503 movsx offq, byte [dirq+4] ; off1_k0 504 paddw m0, m7 505 pabsw m7, m4 506 psignw m2, m3 ; constrain(diff_k1s1) 507 pabsw m3, m5 508%if ARCH_X86_64 509 pmaxsw m9, m4 510 pminsw m10, m7 511 pmaxsw m9, m5 512 pminsw m10, m3 513%else 514 pminsw m7, m10 515 pminsw m7, m3 516 pmaxsw m3, m9, m4 517 pmaxsw m3, m5 518 mova m10, m7 519 mova m9, m3 520%endif 521 psubw m4, m1 ; diff_k1s2 522 psubw m5, m1 ; diff_k1s3 523 pabsw m3, m4 ; adiff_k1s2 524 paddw m0, m2 525 psrlw m2, m3, [sec_shift+gprsize] 526 psubusw m7, m6, m2 527 pabsw m2, m5 ; adiff_k1s3 528 pminsw m7, m3 529 psrlw m3, m2, [sec_shift+gprsize] 530 psignw m7, m4 ; constrain(diff_k1s2) 
531 psubusw m4, m6, m3 532 pminsw m4, m2 533 paddw m0, m7 534%if %1 == 4 535 movq m2, [tmpq+offq+32*0] ; k0p0 536 movhps m2, [tmpq+offq+32*1] 537 neg offq 538 movq m3, [tmpq+offq+32*0] ; k0p1 539 movhps m3, [tmpq+offq+32*1] 540%else 541 movu m2, [tmpq+offq] 542 neg offq 543 movu m3, [tmpq+offq] 544%endif 545 movsx offq, byte [dirq+5] ; off1_k1 546 pabsw m7, m2 547 psignw m4, m5 ; constrain(diff_k1s3) 548 pabsw m5, m3 549%if ARCH_X86_64 550 pmaxsw m9, m2 551 pminsw m10, m7 552 pmaxsw m9, m3 553 pminsw m10, m5 554%else 555 pminsw m7, m10 556 pminsw m7, m5 557 pmaxsw m5, m9, m2 558 pmaxsw m5, m3 559 mova m10, m7 560 mova m9, m5 561%endif 562 psubw m2, m1 ; diff_k0p0 563 psubw m3, m1 ; diff_k0p1 564 paddw m0, m4 565 pabsw m4, m2 ; adiff_k0p0 566 psrlw m5, m4, [pri_shift+gprsize] 567 psubusw m7, [rsp+gprsize], m5 568 pabsw m5, m3 ; adiff_k0p1 569 pminsw m7, m4 570 psrlw m4, m5, [pri_shift+gprsize] 571 psignw m7, m2 ; constrain(diff_k0p0) 572 psubusw m2, [rsp+gprsize], m4 573 pminsw m2, m5 574%if %1 == 4 575 movq m4, [tmpq+offq+32*0] ; k1p0 576 movhps m4, [tmpq+offq+32*1] 577 neg offq 578 movq m5, [tmpq+offq+32*0] ; k1p1 579 movhps m5, [tmpq+offq+32*1] 580%else 581 movu m4, [tmpq+offq] 582 neg offq 583 movu m5, [tmpq+offq] 584%endif 585 psignw m2, m3 ; constrain(diff_k0p1) 586 pabsw m3, m4 587 paddw m7, m2 ; constrain(diff_k0) 588 pabsw m2, m5 589%if ARCH_X86_64 590 pmaxsw m9, m4 591 pminsw m10, m3 592 pmaxsw m9, m5 593 pminsw m10, m2 594%else 595 pminsw m3, m10 596 pminsw m3, m2 597 pmaxsw m2, m9, m4 598 pmaxsw m2, m5 599 mova m10, m3 600 mova m9, m2 601%endif 602 psubw m4, m1 ; diff_k1p0 603 psubw m5, m1 ; diff_k1p1 604 pabsw m3, m4 ; adiff_k1p0 605 pmullw m7, [priq+16*0] ; pri_tap_k0 606 paddw m0, m7 607 psrlw m2, m3, [pri_shift+gprsize] 608 psubusw m7, [rsp+16*0+gprsize], m2 609 pabsw m2, m5 ; adiff_k1p1 610 pminsw m7, m3 611 psrlw m3, m2, [pri_shift+gprsize] 612 psignw m7, m4 ; constrain(diff_k1p0) 613 psubusw m4, [rsp+16*0+gprsize], m3 614 pminsw m4, m2 615 psignw 
m4, m5 ; constrain(diff_k1p1) 616 paddw m7, m4 ; constrain(diff_k1) 617 pmullw m7, [priq+16*1] ; pri_tap_k1 618 paddw m0, m7 ; sum 619 psraw m2, m0, 15 620 paddw m0, m2 621 pmulhrsw m0, m8 622 paddw m0, m1 623%if ARCH_X86_64 624 pmaxsw m9, m1 625 pminsw m0, m9 626%else 627 pmaxsw m2, m9, m1 628 pminsw m0, m2 629%endif 630 pminsw m1, m10 631 pmaxsw m0, m1 632%if %1 == 4 633 add tmpq, 32*2 634 movq [dstq+strideq*0], m0 635 movhps [dstq+strideq*1], m0 636 lea dstq, [dstq+strideq*2] 637%else 638 add tmpq, 32 639 mova [dstq], m0 640 add dstq, strideq 641%endif 642 ret 643%endif 644%endmacro 645 646INIT_XMM ssse3 647%if ARCH_X86_64 648cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \ 649 pri, sec, edge 650 %define px rsp+32*4 651%else 652cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left 653 %define botq topq 654 %define px rsp+32*5 655%endif 656 %define base t0-dir_table 657 %define pri_shift px-16*6 658 %define sec_shift px-16*5 659 mov edged, r9m 660 LEA t0, dir_table 661 movu m0, [dstq+strideq*0] 662 movu m1, [dstq+strideq*1] 663 lea t1, [dstq+strideq*2] 664 movu m2, [t1 +strideq*0] 665 movu m3, [t1 +strideq*1] 666 movddup m7, [base+pw_m16384] 667 mova [px+32*0+0], m0 668 mova [px+32*1+0], m1 669 mova [px+32*2+0], m2 670 mova [px+32*3+0], m3 671 test edgeb, 4 ; HAVE_TOP 672 jz .no_top 673 movifnidn topq, topmp 674 movu m0, [topq+strideq*0] 675 movu m1, [topq+strideq*1] 676 mova [px-32*2+0], m0 677 mova [px-32*1+0], m1 678 test edgeb, 1 ; HAVE_LEFT 679 jz .top_no_left 680 movd m0, [topq+strideq*0-4] 681 movd m1, [topq+strideq*1-4] 682 movd [px-32*2-4], m0 683 movd [px-32*1-4], m1 684 jmp .top_done 685.no_top: 686 mova [px-32*2+0], m7 687 mova [px-32*1+0], m7 688.top_no_left: 689 movd [px-32*2-4], m7 690 movd [px-32*1-4], m7 691.top_done: 692 test edgeb, 8 ; HAVE_BOTTOM 693 jz .no_bottom 694 movifnidn botq, r4mp 695 movu m0, [botq+strideq*0] 696 movu m1, [botq+strideq*1] 697 mova [px+32*4+0], m0 698 mova 
[px+32*5+0], m1 699 test edgeb, 1 ; HAVE_LEFT 700 jz .bottom_no_left 701 movd m0, [botq+strideq*0-4] 702 movd m1, [botq+strideq*1-4] 703 movd [px+32*4-4], m0 704 movd [px+32*5-4], m1 705 jmp .bottom_done 706.no_bottom: 707 mova [px+32*4+0], m7 708 mova [px+32*5+0], m7 709.bottom_no_left: 710 movd [px+32*4-4], m7 711 movd [px+32*5-4], m7 712.bottom_done: 713 test edgeb, 1 ; HAVE_LEFT 714 jz .no_left 715 movifnidn leftq, r2mp 716 movd m0, [leftq+4*0] 717 movd m1, [leftq+4*1] 718 movd m2, [leftq+4*2] 719 movd m3, [leftq+4*3] 720 movd [px+32*0-4], m0 721 movd [px+32*1-4], m1 722 movd [px+32*2-4], m2 723 movd [px+32*3-4], m3 724 jmp .left_done 725.no_left: 726 REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3 727.left_done: 728 test edgeb, 2 ; HAVE_RIGHT 729 jnz .padding_done 730 REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5 731.padding_done: 732 CDEF_FILTER 4, 4 733 734%if ARCH_X86_64 735cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ 736 pri, sec, edge 737%else 738cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left 739%endif 740 mov edged, r9m 741 LEA t0, dir_table 742 movu m0, [dstq+strideq*0] 743 movu m1, [dstq+strideq*1] 744 lea t1, [dstq+strideq*2] 745 movu m2, [t1 +strideq*0] 746 movu m3, [t1 +strideq*1] 747 lea t1, [t1 +strideq*2] 748 movu m4, [t1 +strideq*0] 749 movu m5, [t1 +strideq*1] 750 lea t1, [t1 +strideq*2] 751 movu m6, [t1 +strideq*0] 752 movu m7, [t1 +strideq*1] 753 mova [px+32*0+0], m0 754 mova [px+32*1+0], m1 755 mova [px+32*2+0], m2 756 mova [px+32*3+0], m3 757 mova [px+32*4+0], m4 758 mova [px+32*5+0], m5 759 mova [px+32*6+0], m6 760 mova [px+32*7+0], m7 761 movddup m7, [base+pw_m16384] 762 test edgeb, 4 ; HAVE_TOP 763 jz .no_top 764 movifnidn topq, topmp 765 movu m0, [topq+strideq*0] 766 movu m1, [topq+strideq*1] 767 mova [px-32*2+0], m0 768 mova [px-32*1+0], m1 769 test edgeb, 1 ; HAVE_LEFT 770 jz .top_no_left 771 movd m0, [topq+strideq*0-4] 772 movd m1, [topq+strideq*1-4] 773 movd [px-32*2-4], 
m0 774 movd [px-32*1-4], m1 775 jmp .top_done 776.no_top: 777 mova [px-32*2+0], m7 778 mova [px-32*1+0], m7 779.top_no_left: 780 movd [px-32*2-4], m7 781 movd [px-32*1-4], m7 782.top_done: 783 test edgeb, 8 ; HAVE_BOTTOM 784 jz .no_bottom 785 movifnidn botq, r4mp 786 movu m0, [botq+strideq*0] 787 movu m1, [botq+strideq*1] 788 mova [px+32*8+0], m0 789 mova [px+32*9+0], m1 790 test edgeb, 1 ; HAVE_LEFT 791 jz .bottom_no_left 792 movd m0, [botq+strideq*0-4] 793 movd m1, [botq+strideq*1-4] 794 movd [px+32*8-4], m0 795 movd [px+32*9-4], m1 796 jmp .bottom_done 797.no_bottom: 798 mova [px+32*8+0], m7 799 mova [px+32*9+0], m7 800.bottom_no_left: 801 movd [px+32*8-4], m7 802 movd [px+32*9-4], m7 803.bottom_done: 804 test edgeb, 1 ; HAVE_LEFT 805 jz .no_left 806 movifnidn leftq, r2mp 807 movd m0, [leftq+4*0] 808 movd m1, [leftq+4*1] 809 movd m2, [leftq+4*2] 810 movd m3, [leftq+4*3] 811 movd [px+32*0-4], m0 812 movd [px+32*1-4], m1 813 movd [px+32*2-4], m2 814 movd [px+32*3-4], m3 815 movd m0, [leftq+4*4] 816 movd m1, [leftq+4*5] 817 movd m2, [leftq+4*6] 818 movd m3, [leftq+4*7] 819 movd [px+32*4-4], m0 820 movd [px+32*5-4], m1 821 movd [px+32*6-4], m2 822 movd [px+32*7-4], m3 823 jmp .left_done 824.no_left: 825 REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 826.left_done: 827 test edgeb, 2 ; HAVE_RIGHT 828 jnz .padding_done 829 REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 830.padding_done: 831 CDEF_FILTER 4, 8 832 833%if ARCH_X86_64 834cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ 835 pri, sec, edge 836%else 837cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left 838%endif 839 mov edged, r9m 840 LEA t0, dir_table 841 mova m0, [dstq+strideq*0+ 0] 842 movd m1, [dstq+strideq*0+16] 843 mova m2, [dstq+strideq*1+ 0] 844 movd m3, [dstq+strideq*1+16] 845 lea t1, [dstq+strideq*2] 846 mova m4, [t1 +strideq*0+ 0] 847 movd m5, [t1 +strideq*0+16] 848 mova m6, [t1 +strideq*1+ 0] 849 movd m7, [t1 
+strideq*1+16] 850 lea t1, [t1 +strideq*2] 851 mova [px+32*0+ 0], m0 852 movd [px+32*0+16], m1 853 mova [px+32*1+ 0], m2 854 movd [px+32*1+16], m3 855 mova [px+32*2+ 0], m4 856 movd [px+32*2+16], m5 857 mova [px+32*3+ 0], m6 858 movd [px+32*3+16], m7 859 mova m0, [t1 +strideq*0+ 0] 860 movd m1, [t1 +strideq*0+16] 861 mova m2, [t1 +strideq*1+ 0] 862 movd m3, [t1 +strideq*1+16] 863 lea t1, [t1 +strideq*2] 864 mova m4, [t1 +strideq*0+ 0] 865 movd m5, [t1 +strideq*0+16] 866 mova m6, [t1 +strideq*1+ 0] 867 movd m7, [t1 +strideq*1+16] 868 mova [px+32*4+ 0], m0 869 movd [px+32*4+16], m1 870 mova [px+32*5+ 0], m2 871 movd [px+32*5+16], m3 872 mova [px+32*6+ 0], m4 873 movd [px+32*6+16], m5 874 mova [px+32*7+ 0], m6 875 movd [px+32*7+16], m7 876 movddup m7, [base+pw_m16384] 877 test edgeb, 4 ; HAVE_TOP 878 jz .no_top 879 movifnidn topq, topmp 880 mova m0, [topq+strideq*0+ 0] 881 mova m1, [topq+strideq*0+16] 882 mova m2, [topq+strideq*1+ 0] 883 mova m3, [topq+strideq*1+16] 884 mova [px-32*2+ 0], m0 885 movd [px-32*2+16], m1 886 mova [px-32*1+ 0], m2 887 movd [px-32*1+16], m3 888 test edgeb, 1 ; HAVE_LEFT 889 jz .top_no_left 890 movd m0, [topq+strideq*0-4] 891 movd m1, [topq+strideq*1-4] 892 movd [px-32*2-4], m0 893 movd [px-32*1-4], m1 894 jmp .top_done 895.no_top: 896 mova [px-32*2+ 0], m7 897 movd [px-32*2+16], m7 898 mova [px-32*1+ 0], m7 899 movd [px-32*1+16], m7 900.top_no_left: 901 movd [px-32*2- 4], m7 902 movd [px-32*1- 4], m7 903.top_done: 904 test edgeb, 8 ; HAVE_BOTTOM 905 jz .no_bottom 906 movifnidn botq, r4mp 907 mova m0, [botq+strideq*0+ 0] 908 movd m1, [botq+strideq*0+16] 909 mova m2, [botq+strideq*1+ 0] 910 movd m3, [botq+strideq*1+16] 911 mova [px+32*8+ 0], m0 912 movd [px+32*8+16], m1 913 mova [px+32*9+ 0], m2 914 movd [px+32*9+16], m3 915 test edgeb, 1 ; HAVE_LEFT 916 jz .bottom_no_left 917 movd m0, [botq+strideq*0-4] 918 movd m1, [botq+strideq*1-4] 919 movd [px+32*8- 4], m0 920 movd [px+32*9- 4], m1 921 jmp .bottom_done 922.no_bottom: 923 mova [px+32*8+ 
0], m7 924 movd [px+32*8+16], m7 925 mova [px+32*9+ 0], m7 926 movd [px+32*9+16], m7 927.bottom_no_left: 928 movd [px+32*8- 4], m7 929 movd [px+32*9- 4], m7 930.bottom_done: 931 test edgeb, 1 ; HAVE_LEFT 932 jz .no_left 933 movifnidn leftq, r2mp 934 movd m0, [leftq+4*0] 935 movd m1, [leftq+4*1] 936 movd m2, [leftq+4*2] 937 movd m3, [leftq+4*3] 938 movd [px+32*0- 4], m0 939 movd [px+32*1- 4], m1 940 movd [px+32*2- 4], m2 941 movd [px+32*3- 4], m3 942 movd m0, [leftq+4*4] 943 movd m1, [leftq+4*5] 944 movd m2, [leftq+4*6] 945 movd m3, [leftq+4*7] 946 movd [px+32*4- 4], m0 947 movd [px+32*5- 4], m1 948 movd [px+32*6- 4], m2 949 movd [px+32*7- 4], m3 950 jmp .left_done 951.no_left: 952 REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 953.left_done: 954 test edgeb, 2 ; HAVE_RIGHT 955 jnz .padding_done 956 REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 957.padding_done: 958 CDEF_FILTER 8, 8 959 960%macro CDEF_DIR 0 961%if ARCH_X86_64 962cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax 963 lea r6, [dir_shift] 964 shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc 965 movddup m7, [r6+bdmaxq*8] 966 lea r6, [strideq*3] 967 mova m0, [srcq+strideq*0] 968 mova m1, [srcq+strideq*1] 969 mova m2, [srcq+strideq*2] 970 mova m3, [srcq+r6 ] 971 lea srcq, [srcq+strideq*4] 972 mova m4, [srcq+strideq*0] 973 mova m5, [srcq+strideq*1] 974 mova m6, [srcq+strideq*2] 975 REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 976 pmulhuw m7, [srcq+r6 ] 977 pxor m8, m8 978 packuswb m9, m0, m1 979 packuswb m10, m2, m3 980 packuswb m11, m4, m5 981 packuswb m12, m6, m7 982 REPX {psadbw x, m8}, m9, m10, m11, m12 983 packssdw m9, m10 984 packssdw m11, m12 985 packssdw m9, m11 986 jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main 987%else 988cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax 989 mov bdmaxd, bdmaxm 990 LEA r2, dir_shift 991 shr bdmaxd, 11 992 movddup m7, [r2+bdmaxq*8] 993 lea r3, [strideq*3] 994 pmulhuw m3, m7, [srcq+strideq*0] 995 pmulhuw m4, m7, 
[srcq+strideq*1] 996 pmulhuw m5, m7, [srcq+strideq*2] 997 pmulhuw m6, m7, [srcq+r3 ] 998 movddup m1, [r2-dir_shift+pw_128] 999 lea srcq, [srcq+strideq*4] 1000 pxor m0, m0 1001 packuswb m2, m3, m4 1002 psubw m3, m1 1003 psubw m4, m1 1004 mova [esp+0x00], m3 1005 mova [esp+0x10], m4 1006 packuswb m3, m5, m6 1007 psadbw m2, m0 1008 psadbw m3, m0 1009 psubw m5, m1 1010 psubw m6, m1 1011 packssdw m2, m3 1012 mova [esp+0x20], m5 1013 mova [esp+0x50], m6 1014 pmulhuw m4, m7, [srcq+strideq*0] 1015 pmulhuw m5, m7, [srcq+strideq*1] 1016 pmulhuw m6, m7, [srcq+strideq*2] 1017 pmulhuw m7, [srcq+r3 ] 1018 packuswb m3, m4, m5 1019 packuswb m1, m6, m7 1020 psadbw m3, m0 1021 psadbw m1, m0 1022 packssdw m3, m1 1023 movddup m1, [r2-dir_shift+pw_128] 1024 LEA r2, shufw_6543210x 1025 jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main 1026%endif 1027%endmacro 1028 1029INIT_XMM ssse3 1030CDEF_DIR 1031 1032INIT_XMM sse4 1033CDEF_DIR 1034