; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 16 30 31%if ARCH_X86_64 32%define PIC_sym(a) a 33%else 34%define PIC_base $$ 35%define PIC_sym(a) pic_regq+a-PIC_base 36%endif 37 38pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 39 times 4 db 8, 9 40 41pw_1: times 8 dw 1 42pw_2: times 8 dw 2 43pw_3: times 8 dw 3 44; 4 and 16 need to be next to each other since they are used as alternates 45; depending on whether bitdepth is 10 or 12 46pw_4: times 8 dw 4 47pw_16: times 8 dw 16 48pw_8: times 8 dw 8 49pw_4096: times 8 dw 4096 50 51pb_mask: dd 1, 1, 2, 2 52 53SECTION .text 54 55%if ARCH_X86_32 56%if STACK_ALIGNMENT < 16 57%define extra_stack 2 58%else 59%define extra_stack 0 60%endif 61%endif 62 63%macro RELOC_ARGS 2 ; h/v, off 64ASSERT ARCH_X86_32 65%if STACK_ALIGNMENT < 16 66 mov r5d, [rstk + stack_offset + 4*4 + 4] 67%define lstridem [esp+%2+0*gprsize] 68 mov lstridem, r5d 69 mov r5d, [rstk + stack_offset + 4*5 + 4] 70%define lutm [esp+%2+1*gprsize] 71 mov lutm, r5d 72 mov r5d, [rstk + stack_offset + 4*6 + 4] 73%ifidn %1, v 74%define wm [esp+%2+2*gprsize] 75 mov wm, r5d 76 mov r5d, [rstk + stack_offset + 4*3 + 4] 77%define lm [esp+%2+3*gprsize] 78 mov lm, r5d 79%else ; %1 == h 80%define hm [esp+%2+2*gprsize] 81 mov hm, r5d 82%endif ; %1==v 83 mov r5d, r7m 84%define bdmulm [esp+%2+4*gprsize] 85 mov bdmulm, r5d 86%else 87%define lstridem r4m 88%define lutm r5m 89%ifidn %1, v 90%define wm r6m 91%define lm r3m 92%else 93%define hm r6m 94%endif 95%define bdmulm r7m 96%endif ; STACK_ALIGNMENT 97%endmacro 98 99%macro UNRELOC_ARGS 0 100%if ARCH_X86_32 101%undef lm 102%undef lstridem 103%undef wm 104%undef hm 105%undef lutm 106%endif 107%endmacro 108 109%macro SPLATD 2 110 movd %1, %2 111 pshufd %1, %1, q0000 112%endmacro 113 114%macro SPLATW 2 115 movd %1, %2 116 pshuflw %1, %1, q0000 117 punpcklqdq %1, %1 118%endmacro 119 120; in: out: 121; mm%1 a b c d a e i m 122; mm%2 e f g h b f j n 123; mm%3 i j k l -> c g k o 124; mm%4 m n o p d h l p 125%macro 
TRANSPOSE4X4W 5 126 punpcklwd m%5, m%1, m%2 127 punpckhwd m%1, m%2 128 punpcklwd m%2, m%3, m%4 129 punpckhwd m%3, m%4 130 punpckldq m%4, m%5, m%2 131 punpckhdq m%5, m%2 132 punpckldq m%2, m%1, m%3 133 punpckhdq m%1, m%3 134 135 SWAP %1, %4 136 SWAP %2, %5, %3 137%endmacro 138 139; in: out: 140; m%1 a b c d e f g h a i q y 6 E M U 141; m%2 i j k l m n o p b j r z 7 F N V 142; m%3 q r s t u v w x c k s 0 8 G O W 143; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X 144; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y 145; m%6 E F G H I J K L f n v 3 B J R Z 146; m%7 M N O P Q R S T g o w 4 C K S + 147; m%8 U V W X Y Z + = h p x 5 D L T = 148%if ARCH_X86_64 149%macro TRANSPOSE8X8W 9 150 ; m%1 a b c d e f g h a i q y b j r z 151 ; m%2 i j k l m n o p c k s 0 d l t 1 152 ; m%3 q r s t u v w x -> e m u 2 f n v 3 153 ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 154 TRANSPOSE4X4W %1, %2, %3, %4, %9 155 156 ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V 157 ; m%6 E F G H I J K L 8 G O W 9 H P X 158 ; m%7 M N O P Q R S T -> A I Q Y B J R Z 159 ; m%8 U V W X Y Z + = C K S + D L T = 160 TRANSPOSE4X4W %5, %6, %7, %8, %9 161 162 ; m%1 a i q y b j r z a i q y 6 E M U 163 ; m%2 c k s 0 d l t 1 b j r z 7 F N V 164 ; m%3 e m u 2 f n v 3 c k s 0 8 G O W 165 ; m%4 g o w 4 h p x 5 d l t 1 9 H P X 166 ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y 167 ; m%6 8 G O W 9 H P X f n v 3 B J R Z 168 ; m%7 A I Q Y B J R Z g o w 4 C K S + 169 ; m%8 C K S + D L T = h p x 5 D L T = 170 punpckhqdq m%9, m%1, m%5 171 punpcklqdq m%1, m%5 172 punpckhqdq m%5, m%2, m%6 173 punpcklqdq m%2, m%6 174 punpckhqdq m%6, m%3, m%7 175 punpcklqdq m%3, m%7 176 punpckhqdq m%7, m%4, m%8 177 punpcklqdq m%4, m%8 178 179 SWAP %8, %7, %4, %5, %3, %2, %9 180%endmacro 181%else ; x86-32 182; input: 1-7 in registers, 8 in first memory [read-only] 183; second memory is scratch, and may overlap with first or third memory 184; output: 1-5,7-8 in registers, 6 in third memory [write-only] 185%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] 186 
TRANSPOSE4X4W %1, %2, %3, %4, %8 187%ifnidn %9, "" 188 mov%12 m%8, %9 189%else 190 mova m%8, %10 191%endif 192 mova %10, m%4 193 TRANSPOSE4X4W %5, %6, %7, %8, %4 194 punpckhqdq m%4, m%1, m%5 195 punpcklqdq m%1, m%5 196 punpckhqdq m%5, m%2, m%6 197 punpcklqdq m%2, m%6 198 punpckhqdq m%6, m%3, m%7 199 punpcklqdq m%3, m%7 200 mova m%7, %10 201%ifnidn %11, "" 202 mov%13 %11, m%6 203%else 204 mova %10, m%6 205%endif 206 punpckhqdq m%6, m%7, m%8 207 punpcklqdq m%7, m%8 208 209 ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 210 SWAP %2, %4, %5, %3 211 SWAP %6, %8 212%endmacro 213%endif ; x86-32/64 214 215; transpose and write m8-11, everything else is scratch 216%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp 217 ; transpose 8x4 218 punpcklwd %5, %1, %2 219 punpckhwd %1, %2 220 punpcklwd %2, %3, %4 221 punpckhwd %3, %4 222 punpckldq %4, %5, %2 223 punpckhdq %5, %2 224 punpckldq %2, %1, %3 225 punpckhdq %1, %3 226 227 ; write out 228 movq [dstq+strideq*0-4], %4 229 movhps [dstq+strideq*1-4], %4 230 movq [dstq+strideq*2-4], %5 231 movhps [dstq+stride3q -4], %5 232 lea dstq, [dstq+strideq*4] 233 movq [dstq+strideq*0-4], %2 234 movhps [dstq+strideq*1-4], %2 235 movq [dstq+strideq*2-4], %1 236 movhps [dstq+stride3q -4], %1 237 lea dstq, [dstq+strideq*4] 238%endmacro 239 240%macro FILTER 2 ; width [4/6/8/16], dir [h/v] 241 ; load data 242%ifidn %2, v 243%if %1 == 4 244%if ARCH_X86_64 245%define P1 m8 246%define P0 m9 247%define Q0 m10 248%define Q1 m11 249 mova P1, [dstq+mstrideq*2] ; p1 250 mova P0, [dstq+mstrideq*1] ; p0 251 mova Q0, [dstq+strideq*0] ; q0 252 mova Q1, [dstq+strideq*1] ; q1 253%else ; x86-32 254%define P1 [dstq+mstrideq*2] 255%define P0 [dstq+mstrideq*1] 256%define Q0 [dstq+strideq*0] 257%define Q1 [dstq+strideq*1] 258%endif ; x86-32/64 259%else ; %1 != 4 260 ; load 6-8 pixels, remainder (for wd=16) will be read inline 261 lea tmpq, [dstq+mstrideq*4] 262%if ARCH_X86_64 263 ; we load p3 later 264%define P2 m13 265%define P1 m8 266%define P0 m9 267%define Q0 
m10 268%define Q1 m11 269%define Q2 m14 270 mova P2, [tmpq+strideq*1] 271 mova P1, [tmpq+strideq*2] 272 mova P0, [tmpq+stride3q] 273 mova Q0, [dstq+strideq*0] 274 mova Q1, [dstq+strideq*1] 275 mova Q2, [dstq+strideq*2] 276%if %1 != 6 277%define P3 [tmpq+strideq*0] 278%define Q3 m15 279 mova Q3, [dstq+stride3q] 280%endif ; %1 != 6 281%else ; x86-32 282%define P2 [tmpq+strideq*1] 283%define P1 [dstq+mstrideq*2] 284%define P0 [dstq+mstrideq*1] 285%define Q0 [dstq+strideq*0] 286%define Q1 [dstq+strideq*1] 287%define Q2 [dstq+strideq*2] 288%if %1 != 6 289%define P3 [dstq+mstrideq*4] 290%define Q3 [dstq+stride3q] 291%endif ; %1 != 6 292%endif ; x86-32/64 293%endif ; %1 ==/!= 4 294%else ; %2 != v 295 ; load lines 296%if %1 == 4 297 movq m0, [dstq+strideq*0-4] 298 movq m2, [dstq+strideq*1-4] 299 movq m4, [dstq+strideq*2-4] 300 movq m5, [dstq+stride3q -4] 301 lea tmpq, [dstq+strideq*4] 302 movq m3, [tmpq+strideq*0-4] 303 movq m6, [tmpq+strideq*1-4] 304 movq m1, [tmpq+strideq*2-4] 305 movq m7, [tmpq+stride3q -4] 306 307 ; transpose 4x8 308 ; m0: A-D0 309 ; m2: A-D1 310 ; m4: A-D2 311 ; m5: A-D3 312 ; m3: A-D4 313 ; m6: A-D5 314 ; m1: A-D6 315 ; m7: A-D7 316 punpcklwd m0, m2 317 punpcklwd m4, m5 318 punpcklwd m3, m6 319 punpcklwd m1, m7 320 ; m0: A0-1,B0-1,C0-1,D0-1 321 ; m4: A2-3,B2-3,C2-3,D2-3 322 ; m3: A4-5,B4-5,C4-5,D4-5 323 ; m1: A6-7,B6-7,C6-7,D6-7 324 punpckhdq m2, m0, m4 325 punpckldq m0, m4 326 punpckhdq m4, m3, m1 327 punpckldq m3, m1 328 ; m0: A0-3,B0-3 329 ; m2: C0-3,D0-3 330 ; m3: A4-7,B4-7 331 ; m4: C4-7,D4-7 332 punpckhqdq m1, m0, m3 333 punpcklqdq m0, m3 334 punpckhqdq m3, m2, m4 335 punpcklqdq m2, m4 336 ; m0: A0-7 337 ; m1: B0-7 338 ; m2: C0-7 339 ; m3: D0-7 340%if ARCH_X86_64 341 SWAP 0, 8 342 SWAP 1, 9 343 SWAP 2, 10 344 SWAP 3, 11 345%define P1 m8 346%define P0 m9 347%define Q0 m10 348%define Q1 m11 349%else 350%define P1 [esp+3*mmsize] 351%define P0 [esp+4*mmsize] 352%define Q0 [esp+5*mmsize] 353%define Q1 [esp+6*mmsize] 354 mova P1, m0 355 mova P0, m1 
356 mova Q0, m2 357 mova Q1, m3 358%endif 359%elif %1 == 6 || %1 == 8 360 movu m0, [dstq+strideq*0-8] 361 movu m1, [dstq+strideq*1-8] 362 movu m2, [dstq+strideq*2-8] 363 movu m3, [dstq+stride3q -8] 364 lea tmpq, [dstq+strideq*4] 365 movu m4, [tmpq+strideq*0-8] 366 movu m5, [tmpq+strideq*1-8] 367 movu m6, [tmpq+strideq*2-8] 368%if ARCH_X86_64 369 movu m7, [tmpq+stride3q -8] 370%endif 371 372 ; transpose 8x16 373 ; m0: A-H0,A-H8 374 ; m1: A-H1,A-H9 375 ; m2: A-H2,A-H10 376 ; m3: A-H3,A-H11 377 ; m4: A-H4,A-H12 378 ; m5: A-H5,A-H13 379 ; m6: A-H6,A-H14 380 ; m7: A-H7,A-H15 381%if ARCH_X86_64 382 punpcklwd m8, m0, m1 383%else 384 punpcklwd m7, m0, m1 385%endif 386 punpckhwd m0, m1 387 punpcklwd m1, m2, m3 388 punpckhwd m2, m3 389 punpcklwd m3, m4, m5 390 punpckhwd m4, m5 391%if ARCH_X86_64 392 punpcklwd m5, m6, m7 393 punpckhwd m6, m7 394%else 395 mova [rsp+3*16], m4 396 movu m4, [tmpq+stride3q -8] 397 punpcklwd m5, m6, m4 398 punpckhwd m6, m4 399%endif 400 ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] 401 ; m0: E0-1,F0-1,G0-1,H0-1 402 ; m1: A2-3,B2-3,C2-3,D2-3 403 ; m2: E2-3,F2-3,G2-3,H2-3 404 ; m3: A4-5,B4-5,C4-5,D4-5 405 ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] 406 ; m5: A6-7,B6-7,C6-7,D6-7 407 ; m6: E6-7,F6-7,G6-7,H6-7 408%if ARCH_X86_64 409 punpckldq m7, m8, m1 410 punpckhdq m8, m1 411%else 412 punpckldq m4, m7, m1 413 punpckhdq m7, m1 414%endif 415 punpckldq m1, m0, m2 416 punpckhdq m0, m2 417 punpckldq m2, m3, m5 418 punpckhdq m3, m5 419%if ARCH_X86_64 420 punpckldq m5, m4, m6 421 punpckhdq m4, m6 422%else 423 mova [rsp+4*16], m3 424 mova m3, [rsp+3*16] 425 punpckldq m5, m3, m6 426 punpckhdq m3, m6 427%endif 428 ; m7: A0-3,B0-3 [m4 on x86-32] 429 ; m8: C0-3,D0-3 [m7 on x86-32] 430 ; m1: E0-3,F0-3 431 ; m0: G0-3,H0-3 432 ; m2: A4-7,B4-7 433 ; m3: C4-7,D4-7 [r4 on x86-32] 434 ; m5: E4-7,F4-7 435 ; m4: G4-7,H4-7 [m3 on x86-32] 436%if ARCH_X86_64 437%if %1 != 6 438 punpcklqdq m6, m7, m2 439%endif 440 punpckhqdq m7, m2 441 punpcklqdq m2, m8, m3 442 punpckhqdq m8, m3 443 
punpcklqdq m3, m1, m5 444 punpckhqdq m1, m5 445%if %1 != 6 446 punpckhqdq m5, m0, m4 447%endif 448 punpcklqdq m0, m4 449%if %1 == 8 450 mova [rsp+1*16], m6 451%define P3 [rsp+1*16] 452%endif 453 ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 454 SWAP 7, 13 455 SWAP 8, 2, 9 456 SWAP 3, 10 457 SWAP 1, 11 458 SWAP 0, 14 459 SWAP 5, 15 460%define P2 m13 461%define P1 m8 462%define P0 m9 463%define Q0 m10 464%define Q1 m11 465%define Q2 m14 466%if %1 == 8 467%define Q3 m15 468%endif 469%else ; x86-32 470%if %1 == 8 471%define P3 [rsp+ 6*16] 472 punpcklqdq m6, m4, m2 473 mova P3, m6 474%endif 475 mova m6, [rsp+4*16] 476 punpckhqdq m4, m2 477 punpcklqdq m2, m7, m6 478 punpckhqdq m7, m6 479 punpcklqdq m6, m1, m5 480 punpckhqdq m1, m5 481%if %1 == 8 482%define Q3 [rsp+24*16] 483 punpckhqdq m5, m0, m3 484 mova Q3, m5 485%endif 486 punpcklqdq m0, m3 487%if %1 == 8 488%define P2 [rsp+18*16] 489%define P1 [rsp+19*16] 490%define P0 [rsp+20*16] 491%define Q0 [rsp+21*16] 492%define Q1 [rsp+22*16] 493%define Q2 [rsp+23*16] 494%else 495%define P2 [rsp+3*16] 496%define P1 [rsp+4*16] 497%define P0 [rsp+5*16] 498%define Q0 [rsp+6*16] 499%define Q1 [rsp+7*16] 500%define Q2 [rsp+8*16] 501%endif 502 mova P2, m4 503 mova P1, m2 504 mova P0, m7 505 mova Q0, m6 506 mova Q1, m1 507 mova Q2, m0 508%endif ; x86-32/64 509%else ; %1 == 16 510 ; We only use 14 pixels but we'll need the remainder at the end for 511 ; the second transpose 512 mova m0, [dstq+strideq*0-16] 513 mova m1, [dstq+strideq*1-16] 514 mova m2, [dstq+strideq*2-16] 515 mova m3, [dstq+stride3q -16] 516 lea tmpq, [dstq+strideq*4] 517 mova m4, [tmpq+strideq*0-16] 518 mova m5, [tmpq+strideq*1-16] 519 mova m6, [tmpq+strideq*2-16] 520%if ARCH_X86_64 521 mova m7, [tmpq+stride3q -16] 522 523 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 524 SWAP 5, 13 525 SWAP 6, 8 526 SWAP 7, 9 527%define P2 m13 528%define P1 m8 529%define P0 m9 530%else ; x86-32 531%define P2 [esp+18*16] 532%define P1 [esp+19*16] 533%define P0 [esp+20*16] 534 TRANSPOSE8X8W 0, 1, 2, 
3, 4, 5, 6, 7, \ 535 [tmpq+stride3q -16], P2, "", a, a 536 mova P1, m6 537 mova P0, m7 538%endif ; x86-32/64 539 mova [rsp+ 7*16], m0 540 mova [rsp+ 8*16], m1 541 mova [rsp+ 9*16], m2 542 mova [rsp+10*16], m3 543%define P3 [rsp+6*16] 544 mova P3, m4 545 546 mova m0, [dstq+strideq*0] 547 mova m1, [dstq+strideq*1] 548 mova m2, [dstq+strideq*2] 549 mova m3, [dstq+stride3q ] 550 lea tmpq, [dstq+strideq*4] 551 mova m4, [tmpq+strideq*0] 552 mova m5, [tmpq+strideq*1] 553 mova m6, [tmpq+strideq*2] 554%if ARCH_X86_64 555 mova m7, [tmpq+stride3q ] 556 557 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 558 SWAP 0, 10 559 SWAP 1, 11 560 SWAP 2, 14 561 SWAP 3, 15 562%define Q0 m10 563%define Q1 m11 564%define Q2 m14 565%define Q3 m15 566%else ; x86-32 567 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ 568 [tmpq+stride3q ], [rsp+12*16], "", a, a 569%define Q0 [esp+21*16] 570%define Q1 [esp+22*16] 571%define Q2 [esp+23*16] 572%define Q3 [esp+24*16] 573 mova Q0, m0 574 mova Q1, m1 575 mova Q2, m2 576 mova Q3, m3 577%endif ; x86-32/64 578 579 mova [rsp+11*16], m4 580%if ARCH_X86_64 581 mova [rsp+12*16], m5 582%endif 583 mova [rsp+13*16], m6 584 mova [rsp+14*16], m7 585%endif ; %1 == 4/6/8/16 586%endif ; %2 ==/!= v 587 588 ; load L/E/I/H 589%if ARCH_X86_32 590%define l_strideq r5 591 mov l_strideq, dword lstridem 592%ifidn %2, v 593%define lq r3 594 mov lq, dword lm 595%endif 596%endif 597%ifidn %2, v 598%if cpuflag(sse4) 599 pmovzxbw m1, [lq] 600 pmovzxbw m0, [lq+l_strideq] 601 pxor m2, m2 602%else ; ssse3 603 movq m1, [lq] 604 movq m0, [lq+l_strideq] 605 pxor m2, m2 606 REPX {punpcklbw x, m2}, m1, m0 607%endif ; ssse3/sse4 608%else ; %2 != v 609 movq m0, [lq] ; l0, l1 610 movq m1, [lq+l_strideq] ; l2, l3 611 punpckldq m0, m1 ; l0, l2, l1, l3 612 pxor m2, m2 613 punpcklbw m1, m0, m2 ; l0, l2 614 punpckhbw m0, m2 ; l1, l3 615%endif ; %2==/!=v 616%if ARCH_X86_32 617%ifidn %2, v 618%undef lq 619 mov mstrideq, mstridem 620%endif 621%endif 622 pcmpeqw m5, m2, m0 623 pand m1, m5 624 por m0, m1 ; 
l[x][] ? l[x][] : l[x-stride][] 625 pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] 626 pcmpeqw m5, m2, m0 ; !L 627 psrlw m5, 1 628%if ARCH_X86_64 629 psrlw m2, m0, [lutq+128] 630 SPLATW m1, [lutq+136] 631%else ; x86-32 632 mov r5, lutm 633 psrlw m2, m0, [r5+128] 634 SPLATW m1, [r5+136] 635%endif ; x86-32/64 636 pminsw m2, m1 637 pmaxsw m2, [PIC_sym(pw_1)] ; I 638 psrlw m1, m0, 4 ; H 639 paddw m0, [PIC_sym(pw_2)] 640 paddw m0, m0 641 paddw m0, m2 ; E 642 REPX {pmullw x, [bdmulq]}, m0, m1, m2 643%if ARCH_X86_32 644%undef l_strideq 645 lea stride3q, [strideq*3] 646%endif 647 648 psubw m3, P1, P0 ; p1-p0 649 psubw m4, Q0, Q1 ; q0-q1 650 REPX {pabsw x, x}, m3, m4 651 pmaxsw m3, m5 652 pmaxsw m3, m4 653 pcmpgtw m7, m3, m1 ; hev 654%if %1 != 4 655 psubw m4, P2, P0 ; p2-p0 656 pabsw m4, m4 657 pmaxsw m4, m3 658%if %1 != 6 659 mova m6, P3 ; p3 660 psubw m5, m6, P0 ; p3-p0 661 pabsw m5, m5 662 pmaxsw m4, m5 663%endif ; %1 != 6 664 psubw m5, Q0, Q2 ; q0-q2 665 pabsw m5, m5 666 pmaxsw m4, m5 667%if %1 != 6 668 psubw m5, Q0, Q3 ; q0-q3 669 pabsw m5, m5 670 pmaxsw m4, m5 671%endif ; %1 != 6 672 pcmpgtw m4, [bdmulq] ; !flat8in 673 674 psubw m5, P2, P1 ; p2-p1 675 pabsw m5, m5 676%if %1 != 6 677 psubw m6, P2 ; p3-p2 678 pabsw m6, m6 679 pmaxsw m5, m6 680 psubw m6, Q2, Q3 ; q2-q3 681 pabsw m6, m6 682 pmaxsw m5, m6 683%endif ; %1 != 6 684 psubw m6, Q2, Q1 ; q2-q1 685 pabsw m6, m6 686 pmaxsw m5, m6 687 688%if %1 == 16 689 SPLATD m6, [maskq+8] 690 SPLATD m1, [maskq+4] 691 por m6, m1 692 pand m6, m12 693 pcmpeqd m6, m12 694 pand m5, m6 695%else ; %1 != 16 696 SPLATD m6, [maskq+4] 697 pand m6, m12 698 pcmpeqd m6, m12 699 pand m5, m6 ; only apply fm-wide to wd>4 blocks 700%endif ; %1==/!=16 701 pmaxsw m3, m5 702%endif ; %1 != 4 703 pcmpgtw m3, m2 704 705 psubw m5, P1, Q1 ; p1-q1 706 psubw m6, P0, Q0 ; p0-q0 707 REPX {pabsw x, x}, m5, m6 708 paddw m6, m6 709 psrlw m5, 1 710 paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) 711 pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E 712 por m3, 
m5 713 714%if %1 == 16 715 716%ifidn %2, v 717 lea tmpq, [dstq+mstrideq*8] 718 mova m0, [tmpq+strideq*1] 719 mova m1, [tmpq+strideq*2] 720 mova m2, [tmpq+stride3q] 721%else ; %2 != v 722 mova m0, [rsp+ 8*16] 723 mova m1, [rsp+ 9*16] 724 mova m2, [rsp+10*16] 725%endif ; %2==/!=v 726 REPX {psubw x, P0}, m0, m1, m2 727 REPX {pabsw x, x}, m0, m1, m2 728 pmaxsw m1, m0 729 pmaxsw m1, m2 730%ifidn %2, v 731 lea tmpq, [dstq+strideq*4] 732 mova m0, [tmpq+strideq*0] 733 mova m2, [tmpq+strideq*1] 734 mova m5, [tmpq+strideq*2] 735%else ; %2 != v 736 mova m0, [rsp+11*16] 737 mova m2, [rsp+12*16] 738 mova m5, [rsp+13*16] 739%endif ; %2==/!=v 740 REPX {psubw x, Q0}, m0, m2, m5 741 REPX {pabsw x, x}, m0, m2, m5 742 pmaxsw m0, m2 743 pmaxsw m1, m5 744 pmaxsw m1, m0 745 pcmpgtw m1, [bdmulq] ; !flat8out 746 por m1, m4 ; !flat8in | !flat8out 747 SPLATD m2, [maskq+8] 748 pand m5, m2, m12 749 pcmpeqd m5, m12 750 pandn m1, m5 ; flat16 751 pandn m5, m3, m1 ; flat16 & fm 752 SWAP 1, 5 753 754 SPLATD m5, [maskq+4] 755 por m5, m2 756 pand m2, m5, m12 757 pcmpeqd m2, m12 758 pandn m4, m2 ; flat8in 759 pandn m2, m3, m4 760 SWAP 2, 4 761 SPLATD m2, [maskq+0] 762 por m2, m5 763 pand m2, m12 764 pcmpeqd m2, m12 765 pandn m3, m2 766 pandn m0, m4, m3 ; fm & !flat8 & !flat16 767 SWAP 0, 3 768 pandn m0, m1, m4 ; flat8 & !flat16 769 SWAP 0, 4 770%elif %1 != 4 771 SPLATD m0, [maskq+4] 772 pand m2, m0, m12 773 pcmpeqd m2, m12 774 pandn m4, m2 775 pandn m2, m3, m4 ; flat8 & fm 776 SWAP 2, 4 777 SPLATD m2, [maskq+0] 778 por m0, m2 779 pand m0, m12 780 pcmpeqd m0, m12 781 pandn m3, m0 782 pandn m0, m4, m3 ; fm & !flat8 783 SWAP 0, 3 784%else ; %1 == 4 785 SPLATD m0, [maskq+0] 786 pand m0, m12 787 pcmpeqd m0, m12 788 pandn m3, m0 ; fm 789%endif ; %1==/!=4 790 791 ; short filter 792%if ARCH_X86_64 793 SPLATW m0, r7m 794%else 795 SPLATW m0, bdmulm 796%endif 797 pcmpeqw m2, m2 798 psrlw m0, 1 ; 511 or 2047 799 pxor m2, m0 ; -512 or -2048 800 801 psubw m5, Q0, P0 ; q0-p0 802 paddw m6, m5, m5 803 paddw m6, m5 ; 
3*(q0-p0) 804 psubw m5, P1, Q1 ; iclip_diff(p1-q1) 805 pminsw m5, m0 806 pmaxsw m5, m2 807 pand m5, m7 ; f=iclip_diff(p1-q1)&hev 808 paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) 809 pminsw m5, m0 810 pmaxsw m5, m2 811 pand m3, m5 ; f&=fm 812 paddw m5, m3, [PIC_sym(pw_3)] 813 paddw m3, [PIC_sym(pw_4)] 814 REPX {pminsw x, m0}, m5, m3 815 psraw m5, 3 ; f2 816 psraw m3, 3 ; f1 817 psubw m0, m2 ; 1023 or 4095 818 pxor m2, m2 819%if ARCH_X86_64 820 paddw P0, m5 821 psubw Q0, m3 822%else 823 paddw m5, P0 824 psubw m6, Q0, m3 825 REPX {pminsw x, m0}, m5, m6 826 REPX {pmaxsw x, m2}, m5, m6 827%endif 828 829 paddw m3, [PIC_sym(pw_1)] 830 psraw m3, 1 ; f=(f1+1)>>1 831 pandn m7, m3 ; f&=!hev 832 SWAP 7, 3 833%if ARCH_X86_64 834 paddw P1, m3 835 psubw Q1, m3 836 REPX {pminsw x, m0}, P1, P0, Q0, Q1 837 REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 838%else 839 psubw m7, Q1, m3 840 paddw m3, P1 841 REPX {pminsw x, m0}, m7, m3 842 REPX {pmaxsw x, m2}, m7, m3 843%if %1 > 4 844 mova P1, m3 845 mova P0, m5 846 mova Q0, m6 847 mova Q1, m7 848%endif 849%endif 850 851%if %1 == 16 852 853; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 854; m12=filter bits mask 855; m13-15=p2/q2/q3 856; m0,2-3,5-7 = free 857 858 ; flat16 filter 859%ifidn %2, v 860 lea tmpq, [dstq+mstrideq*8] 861 mova m0, [tmpq+strideq*1] ; p6 862 mova m2, [tmpq+strideq*2] ; p5 863 mova m7, [tmpq+stride3q] ; p4 864 mova m6, [tmpq+strideq*4] ; p3 865 lea tmpq, [dstq+mstrideq*4] 866%else ; %2 != v 867 mova m0, [rsp+ 8*16] 868 mova m2, [rsp+ 9*16] 869 mova m7, [rsp+10*16] 870 mova m6, [rsp+ 6*16] 871%endif ; %2==/!=v 872 873 mova [rsp+ 0*16], m4 874 875 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 876 psllw m3, m0, 3 ; p6*8 877 paddw m3, [PIC_sym(pw_8)] 878 paddw m5, m2, m7 ; p5+p4 879 psubw m3, m0 880 paddw m5, m5 ; (p5+p4)*2 881 paddw m3, m6 ; p6*7+p3 882 paddw m5, P2 ; (p5+p4)*2+p2 883 paddw m3, P1 ; p6*7+p3+p1 884 paddw m5, P0 ; (p5+p4)*2+p2+p0 885 paddw m3, Q0 ; p6*7+p3+p1+q0 886 paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 887 psrlw m5, m3, 4 888 pand 
m5, m1 889 pandn m4, m1, m2 890 por m5, m4 891%ifidn %2, v 892 mova [tmpq+mstrideq*2], m5 ; p5 893%else ; %2 != v 894 mova [rsp+9*16], m5 895%endif ; %2==/!=v 896 897 ; sub p6*2, add p3/q1 898 paddw m3, m6 899 paddw m5, m0, m0 900 paddw m3, Q1 901 psubw m3, m5 902 psrlw m5, m3, 4 903 pand m5, m1 904 pandn m4, m1, m7 905 por m5, m4 906%ifidn %2, v 907 mova [tmpq+mstrideq*1], m5 ; p4 908%else ; %2 != v 909 mova [rsp+10*16], m5 910%endif ; %2==/!=v 911 912 ; sub p6/p5, add p2/q2 913 psubw m3, m0 914 paddw m5, P2, Q2 915 psubw m3, m2 916 paddw m3, m5 917 psrlw m5, m3, 4 918 pand m5, m1 919 pandn m4, m1, m6 920 por m5, m4 921%ifidn %2, v 922 mova [tmpq+strideq*0], m5 ; p3 923%else ; %2 != v 924 mova [rsp+6*16], m5 925%endif ; %2==/!=v 926 927%define WRITE_IN_PLACE 0 928%ifidn %2, v 929%if ARCH_X86_64 930%define WRITE_IN_PLACE 1 931%endif 932%endif 933 934 ; sub p6/p4, add p1/q3 935 paddw m3, P1 936 paddw m5, m0, m7 937 paddw m3, Q3 938 psubw m3, m5 939 psrlw m5, m3, 4 940 pand m5, m1 941 pandn m4, m1, P2 942 por m5, m4 943%if WRITE_IN_PLACE 944 mova [tmpq+strideq*1], m5 945%else 946 mova [rsp+1*16], m5 ; don't clobber p2/m13 947%endif 948 949 ; sub p6/p3, add p0/q4 950 paddw m3, P0 951 paddw m5, m0, m6 952%ifidn %2, v 953 paddw m3, [dstq+strideq*4] 954%else ; %2 != v 955 paddw m3, [rsp+11*16] 956%endif ; %2==/!=v 957 psubw m3, m5 958 psrlw m5, m3, 4 959 pand m5, m1 960 pandn m4, m1, P1 961 por m5, m4 962%if WRITE_IN_PLACE 963 mova [dstq+mstrideq*2], m5 964%else 965 mova [rsp+2*16], m5 ; don't clobber p1/m3 966%endif 967 968 ; sub p6/p2, add q0/q5 969 paddw m3, Q0 970 paddw m5, m0, P2 971%ifidn %2, v 972%if ARCH_X86_32 973 lea r4, P2 974%endif 975 lea tmpq, [dstq+strideq*4] 976 paddw m3, [tmpq+strideq*1] 977%else ; %2 != v 978 paddw m3, [rsp+12*16] 979%endif ; %2==/!=v 980 psubw m3, m5 981 psrlw m5, m3, 4 982 pand m5, m1 983 pandn m4, m1, P0 984 por m5, m4 985%if WRITE_IN_PLACE 986 mova [dstq+mstrideq*1], m5 987%else 988 mova [rsp+3*16], m5 ; don't clobber p0/m4 
989%endif 990 991 ; sub p6/p1, add q1/q6 992 paddw m3, Q1 993 paddw m5, m0, P1 994%ifidn %2, v 995 mova m0, [tmpq+strideq*2] ; q6 996%else ; %2 != v 997 mova m0, [rsp+13*16] ; q6 998%endif ; %2==/!=v 999 paddw m3, m0 1000 psubw m3, m5 1001 psrlw m5, m3, 4 1002 pand m5, m1 1003 pandn m4, m1, Q0 1004 por m5, m4 1005%if WRITE_IN_PLACE 1006 mova [dstq], m5 1007%else 1008 mova [rsp+4*16], m5 ; don't clobber q0/m5 1009%endif 1010 1011 ; sub p5/p0, add q2/q6 1012 paddw m3, Q2 1013 paddw m5, m2, P0 1014 paddw m3, m0 1015 psubw m3, m5 1016 psrlw m5, m3, 4 1017 pand m5, m1 1018 pandn m4, m1, Q1 1019 por m2, m5, m4 ; don't clobber q1/m6 1020 1021 ; sub p4/q0, add q3/q6 1022 paddw m3, Q3 1023 paddw m7, Q0 1024 paddw m3, m0 1025 psubw m3, m7 1026 psrlw m7, m3, 4 1027 pand m7, m1 1028 pandn m4, m1, Q2 1029 por m7, m4 ; don't clobber q2/m14 1030 1031 ; sub p3/q1, add q4/q6 1032%ifidn %2, v 1033 paddw m3, [tmpq+strideq*0] 1034%else ; %2 != v 1035 paddw m3, [rsp+11*16] 1036%endif ; %2==/!=v 1037 paddw m6, Q1 1038 paddw m3, m0 1039 psubw m3, m6 1040 psrlw m6, m3, 4 1041 pand m6, m1 1042 pandn m4, m1, Q3 1043 por m6, m4 1044%if WRITE_IN_PLACE 1045 mova [tmpq+mstrideq], m6 ; q3 1046%else ; %2 != v 1047 mova [rsp+5*16], m6 1048%endif ; %2==/!=v 1049 1050 ; sub p2/q2, add q5/q6 1051%ifidn %2, v 1052 paddw m3, [tmpq+strideq*1] 1053%if ARCH_X86_64 1054 paddw m5, P2, Q2 1055%else 1056 ; because tmpq is clobbered, so we use a backup pointer for P2 instead 1057 paddw m5, [r4], Q2 1058 mov pic_regq, pic_regm 1059%endif 1060%else ; %2 != v 1061 paddw m3, [rsp+12*16] 1062 paddw m5, P2, Q2 1063%endif ; %2==/!=v 1064 paddw m3, m0 1065 psubw m3, m5 1066 psrlw m5, m3, 4 1067 pand m5, m1 1068%ifidn %2, v 1069 pandn m4, m1, [tmpq+strideq*0] 1070%else ; %2 != v 1071 pandn m4, m1, [rsp+11*16] 1072%endif ; %2==/!=v 1073 por m5, m4 1074%ifidn %2, v 1075 mova [tmpq+strideq*0], m5 ; q4 1076%else ; %2 != v 1077 mova [rsp+11*16], m5 1078%endif ; %2==/!=v 1079 1080 ; sub p1/q3, add q6*2 1081 psubw m3, P1 1082 
paddw m0, m0 1083 psubw m3, Q3 1084 paddw m3, m0 1085 psrlw m5, m3, 4 1086 pand m5, m1 1087%ifidn %2, v 1088 pandn m4, m1, [tmpq+strideq*1] 1089%else ; %2 != v 1090 pandn m4, m1, [rsp+12*16] 1091%endif ; %2==/!=v 1092 por m5, m4 1093%ifidn %2, v 1094 mova [tmpq+strideq*1], m5 ; q5 1095%else ; %2 != v 1096 mova [rsp+12*16], m5 1097%endif ; %2==/!=v 1098 1099 mova m4, [rsp+0*16] 1100%ifidn %2, v 1101 lea tmpq, [dstq+mstrideq*4] 1102%endif 1103%if ARCH_X86_64 1104 SWAP 2, 11 1105 SWAP 7, 14 1106 SWAP 6, 15 1107%else ; x86-32 1108 mova Q1, m2 1109 mova Q2, m7 1110%endif ; x86-32/64 1111%if WRITE_IN_PLACE 1112 mova P2, [tmpq+strideq*1] 1113 mova P1, [tmpq+strideq*2] 1114 mova P0, [tmpq+stride3q] 1115 mova Q0, [dstq] 1116%elif ARCH_X86_64 1117 mova P2, [rsp+1*16] 1118 mova P1, [rsp+2*16] 1119 mova P0, [rsp+3*16] 1120 mova Q0, [rsp+4*16] 1121%else ; !WRITE_IN_PLACE & x86-32 1122 mova m0, [rsp+1*16] 1123 mova m1, [rsp+2*16] 1124 mova m2, [rsp+3*16] 1125 mova m3, [rsp+4*16] 1126 mova m7, [rsp+5*16] 1127 mova P2, m0 1128 mova P1, m1 1129 mova P0, m2 1130 mova Q0, m3 1131 mova Q3, m7 1132%endif ; WRITE_IN_PLACE / x86-32/64 1133%undef WRITE_IN_PLACE 1134%endif ; %1 == 16 1135 1136%if %1 >= 8 1137 1138 ; flat8 filter 1139 mova m0, P3 ; p3 1140 paddw m1, m0, P2 ; p3+p2 1141 paddw m2, P1, P0 ; p1+p0 1142 paddw m3, m1, m1 ; 2*(p3+p2) 1143 paddw m2, m0 ; p1+p0+p3 1144 paddw m3, Q0 ; 2*(p3+p2)+q0 1145 paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 1146 pmulhrsw m7, m2, [PIC_sym(pw_4096)] 1147 psubw m7, P2 1148 pand m7, m4 1149 1150 paddw m3, P1, Q1 ; p1+q1 1151 psubw m2, m1 ; 2*p3+p2+p1+p0+q0 1152 paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 1153 pmulhrsw m3, m2, [PIC_sym(pw_4096)] 1154 psubw m3, P1 1155 pand m3, m4 1156 1157 paddw m5, m0, P1 ; p3+p1 1158 paddw m6, P0, Q2 ; p0+q2 1159 psubw m2, m5 ; p3+p2+p1+p0+q0+q1 1160 paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 1161 pmulhrsw m5, m2, [PIC_sym(pw_4096)] 1162 psubw m5, P0 1163 pand m5, m4 1164 1165 paddw m6, m0, P0 ; p3+p0 1166 paddw m1, Q0, Q3 ; q0+q3 1167 
psubw m2, m6 ; p2+p1+p0+q0+q1+q2 1168 paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 1169 pmulhrsw m6, m2, [PIC_sym(pw_4096)] 1170 psubw m6, Q0 1171 pand m6, m4 1172 1173 paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 1174 paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 1175 paddw m1, P2, Q0 ; p2+q0 1176 psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 1177 pmulhrsw m1, m2, [PIC_sym(pw_4096)] 1178 psubw m1, Q1 1179 pand m1, m4 1180 1181 psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 1182 psubw m2, Q1 ; p0+q0+q1+q2+2*q3 1183 paddw m0, Q3, Q2 ; q3+q2 1184 paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 1185 pmulhrsw m2, [PIC_sym(pw_4096)] 1186 psubw m2, Q2 1187 pand m2, m4 1188 1189 paddw m7, P2 1190 paddw m3, P1 1191 paddw m5, P0 1192 paddw m6, Q0 1193 paddw m1, Q1 1194 paddw m2, Q2 1195 1196%ifidn %2, v 1197 mova [tmpq+strideq*1], m7 ; p2 1198 mova [tmpq+strideq*2], m3 ; p1 1199 mova [tmpq+stride3q ], m5 ; p0 1200 mova [dstq+strideq*0], m6 ; q0 1201 mova [dstq+strideq*1], m1 ; q1 1202 mova [dstq+strideq*2], m2 ; q2 1203%else ; %2 != v 1204 mova m0, P3 1205 1206%if %1 == 8 1207 lea tmpq, [dstq+strideq*4] 1208%if ARCH_X86_64 1209 SWAP 4, 15 1210 TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 1211%else 1212 TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ 1213 Q3, [tmpq+strideq*1-8], a, u 1214%endif 1215 1216 ; write 8x8 1217 movu [dstq+strideq*0-8], m0 1218 movu [dstq+strideq*1-8], m7 1219 movu [dstq+strideq*2-8], m3 1220 movu [dstq+stride3q -8], m5 1221 movu [tmpq+strideq*0-8], m6 1222%if ARCH_X86_64 1223 movu [tmpq+strideq*1-8], m1 1224%endif 1225 movu [tmpq+strideq*2-8], m2 1226 movu [tmpq+stride3q -8], m4 1227 lea dstq, [dstq+strideq*8] 1228%else ; %1 != 8 1229%if ARCH_X86_64 1230 SWAP 6, 8 1231 SWAP 1, 9 1232 SWAP 2, 10 1233%else 1234 mova [rsp+1*16], m6 1235 mova [rsp+2*16], m1 1236 mova [rsp+3*16], m2 1237%endif 1238 1239 mova m1, [rsp+ 7*16] 1240 mova m2, [rsp+ 8*16] 1241 mova m4, [rsp+ 9*16] 1242 mova m6, [rsp+10*16] 1243 lea tmpq, [dstq+strideq*4] 1244%if ARCH_X86_64 1245 TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 1246%else 1247 
mova [rsp+7*16], m5 1248 TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ 1249 [rsp+7*16], [tmpq+strideq*1-16], a, a 1250%endif 1251 1252 mova [dstq+strideq*0-16], m1 1253 mova [dstq+strideq*1-16], m2 1254 mova [dstq+strideq*2-16], m4 1255 mova [dstq+stride3q -16], m6 1256 mova [tmpq+strideq*0-16], m0 1257%if ARCH_X86_64 1258 mova [tmpq+strideq*1-16], m7 1259%endif 1260 mova [tmpq+strideq*2-16], m3 1261 mova [tmpq+stride3q -16], m5 1262 1263%if ARCH_X86_64 1264 SWAP 6, 8 1265 SWAP 1, 9 1266 SWAP 2, 10 1267 SWAP 4, 15 1268%else 1269 mova m6, [rsp+1*16] 1270 mova m1, [rsp+2*16] 1271 mova m2, [rsp+3*16] 1272 mova m4, Q3 1273%endif 1274 mova m0, [rsp+11*16] 1275 mova m3, [rsp+12*16] 1276 mova m5, [rsp+13*16] 1277%if ARCH_X86_64 1278 mova m7, [rsp+14*16] 1279 TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 1280%else 1281 TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ 1282 [rsp+14*16], [tmpq+strideq*1], a, a 1283%endif 1284 mova [dstq+strideq*0], m6 1285 mova [dstq+strideq*1], m1 1286 mova [dstq+strideq*2], m2 1287 mova [dstq+stride3q ], m4 1288 mova [tmpq+strideq*0], m0 1289%if ARCH_X86_64 1290 mova [tmpq+strideq*1], m3 1291%endif 1292 mova [tmpq+strideq*2], m5 1293 mova [tmpq+stride3q ], m7 1294 lea dstq, [dstq+strideq*8] 1295%endif ; %1==/!=8 1296%endif ; %2==/!=v 1297%elif %1 == 6 1298 ; flat6 filter 1299 paddw m3, P1, P0 ; p1+p0 1300 paddw m3, P2 ; p2+p1+p0 1301 paddw m6, P2, Q0 ; p2+q0 1302 paddw m3, m3 ; 2*(p2+p1+p0) 1303 paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 1304 pmulhrsw m2, m3, [PIC_sym(pw_4096)] 1305 psubw m2, P1 1306 pand m2, m4 1307 1308 paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) 1309 paddw m6, P2, P2 ; 2*p2 1310 paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 1311 psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 1312 pmulhrsw m5, m3, [PIC_sym(pw_4096)] 1313 psubw m5, P0 1314 pand m5, m4 1315 1316 paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) 1317 paddw m6, P2, P1 ; p2+p1 1318 paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 1319 psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 1320 pmulhrsw m6, m3, [PIC_sym(pw_4096)] 1321 psubw m6, Q0 1322 pand m6, 
m4

    psubw           m3, P1                  ; 2*(p0+q0+q1)+q2
%if ARCH_X86_64
    paddw           Q2, Q2                  ; q2*2
%else
    ; x86-32 keeps Q2 in a stack slot, so double it via a scratch register
    mova            m0, Q2
    paddw           m0, m0
%endif
    psubw           m3, P0                  ; p0+2*(q0+q1)+q2
%if ARCH_X86_64
    paddw           m3, Q2                  ; p0+2*(q0+q1+q2)+q2
%else
    paddw           m3, m0
%endif
    ; pmulhrsw by 4096: (x*4096*2 + 0x8000) >> 16 == (x+4) >> 3, i.e.
    ; round-to-nearest divide by 8 of the weighted sum above
    pmulhrsw        m3, [PIC_sym(pw_4096)]
    psubw           m3, Q1                  ; delta vs. unfiltered q1
    pand            m3, m4                  ; keep delta only where mask m4 is set

    ; add masked deltas back onto the unfiltered pixel rows
    paddw           m2, P1
    paddw           m5, P0
    paddw           m6, Q0
    paddw           m3, Q1

%ifidn %2, v
    mova [dstq+mstrideq*2], m2              ; p1
    mova [dstq+mstrideq*1], m5              ; p0
    mova [dstq+strideq*0], m6               ; q0
    mova [dstq+strideq*1], m3               ; q1
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
%endif ; %2==/!=v
%else ; %1 == 4
    ; narrow (4-tap) filter: results are already in P1/P0/Q0/Q1
    ; (x86-32 keeps them in m3/m5/m6/m7 instead of named aliases)
%if ARCH_X86_64
%ifidn %2, v
    mova [dstq+mstrideq*2], P1              ; p1
    mova [dstq+mstrideq*1], P0              ; p0
    mova [dstq+strideq*0], Q0               ; q0
    mova [dstq+strideq*1], Q1               ; q1
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
%endif ; %2==/!=v
%else ; x86-32
%ifidn %2, v
    mova [dstq+mstrideq*2], m3
    mova [dstq+mstrideq*1], m5
    mova [dstq+strideq*0], m6
    mova [dstq+strideq*1], m7
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
%endif ; %2==/!=v
%endif ; x86-32/64
%endif ; %1
%undef P3
%undef P2
%undef P1
%undef P0
%undef Q0
%undef Q1
%undef Q2
%undef Q3
%endmacro

INIT_XMM ssse3
;-----------------------------------------------------------------------------
; lpf_v_sb_y_16bpc(dst, stride, mask, l, l_stride, lut, w, ...)
; Vertical luma superblock loop filter (filters horizontal edges).
; Walks left-to-right, two 4-pixel edge units per iteration (mask_bits holds
; two mask bits; dst advances 16 bytes = 8 16-bit pixels; w decreases by 2).
; For each unit it tries the widest applicable filter first:
;   vmask[2] -> FILTER 16, vmask[1] -> FILTER 8, vmask[0] -> FILTER 4.
; Bit 7 of r7m (presumably bitdepth_max -- 0x3ff/0xfff; confirm with caller)
; yields 0 or 16, indexing pw_4 vs pw_16, which are deliberately adjacent
; in RODATA (see the comment there), giving the 10/12-bpc multiplier.
;-----------------------------------------------------------------------------
; stack layout:
; r0 - flat8 backup inside flat16 code
%if ARCH_X86_64
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
                          dst, stride, mask, l, l_stride, lut, \
                          w, stride3, mstride, tmp, mask_bits, bdmul
    mov             r6d, r7m
    sar             r6d, 7
    and             r6d, 16                 ; 0 for 10bpc, 16 for 12bpc
    lea             bdmulq, [pw_4]
    add             bdmulq, r6              ; bdmul = &pw_4 or &pw_16
    mov             wd, wm
    shl             l_strideq, 2            ; l_stride *= sizeof(l[0]) (4)
    sub             lq, l_strideq           ; point l one row up
%else
; stack layout [32bit only]:
; r1-4 - p2-q0 post-filter16
; r5 - p3
; r6 - q3 post-filter16
; r7 - GPRs [mask_bitsm, mstridem]
; r8 - m12/pb_mask
; r9 - bdmulq
cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
                          dst, stride, mask, mstride, pic_reg, stride3, tmp
    RELOC_ARGS      v, 10*16
%if STACK_ALIGNMENT >= 16
    mov             r5d, r7m
%endif
    sar             r5d, 7
    and             r5d, 16                 ; 0 for 10bpc, 16 for 12bpc
    LEA             pic_regq, PIC_base
%define pic_regm dword [esp+7*16+2*gprsize]
    mov             pic_regm, pic_regq      ; spill PIC base for later reloads
    mova            m0, [PIC_sym(pw_4)+r5]  ; pw_4 or pw_16 (adjacent in RODATA)
%define bdmulq esp+9*16
    mova            [bdmulq], m0
    shl             dword lstridem, 2
    sub             r3, dword lstridem
    mov             dword lm, r3
%endif
    mov             mstrideq, strideq
    neg             mstrideq                ; mstride = -stride
    lea             stride3q, [strideq*3]
%if ARCH_X86_64
    mov             mask_bitsd, 0x3         ; two edge-unit bits per iteration
    mova            m12, [pb_mask]
%else
%define mstridem dword [esp+7*16+1*gprsize]
    mov             mstridem, mstrideq
%define mask_bitsm dword [esp+7*16+0*gprsize]
    mov             mask_bitsm, 0x3
    mova            m0, [PIC_sym(pb_mask)]
%define m12 [esp+8*16]                      ; m12 lives in a stack slot on x86-32
    mova            m12, m0
%endif

.loop:
%if ARCH_X86_64
    test            [maskq+8], mask_bitsd   ; vmask[2]
%else
    mov             r6d, mask_bitsm         ; r6d reused by the tests below
    test            [maskq+8], r6d
%endif
    jz .no_flat16

    FILTER          16, v
    jmp .end

.no_flat16:
%if ARCH_X86_64
    test            [maskq+4], mask_bitsd   ; vmask[1]
%else
    test            [maskq+4], r6d
%endif
    jz .no_flat

    FILTER          8, v
    jmp .end

.no_flat:
%if ARCH_X86_64
    test            [maskq+0], mask_bitsd   ; vmask[0]
%else
    test            [maskq+0], r6d
%endif
    jz .end

    FILTER          4, v

.end:
    ; advance to the next pair of 4-pixel edge units
%if ARCH_X86_64
    pslld           m12, 2                  ; shift pb_mask lanes in step with mask_bits
    add             lq, 8
%else
    mova            m0, m12
    pslld           m0, 2
    mova            m12, m0
    add             dword lm, 8
%endif
    add             dstq, 16                ; 8 pixels at 2 bytes each
%if ARCH_X86_64
    shl             mask_bitsd, 2
    sub             wd, 2
%else
    shl             mask_bitsm, 2
    sub             dword wm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
;-----------------------------------------------------------------------------
; lpf_h_sb_y_16bpc(dst, stride, mask, l, l_stride, lut, h, ...)
; Horizontal luma superblock loop filter (filters vertical edges); same
; wide->narrow filter selection as the vertical variant, but walks down
; rows (h decreases by 2 per iteration) and transposes for the stores.
;-----------------------------------------------------------------------------
; stack layout:
; r0 - flat8 backup inside flat16
; r1-4 - p2-q0 post-filter16 backup
; r5 - q3 post-filter16 backup
; r6 - p3
; r7-10 - p7-4
; r11-14 - q4-7
%if ARCH_X86_64
cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, tmp, mask_bits, bdmul
    mov             r6d, r7m
    sar             r6d, 7
    and             r6d, 16                 ; 0 for 10bpc, 16 for 12bpc
    lea             bdmulq, [pw_4]
    add             bdmulq, r6              ; bdmul = &pw_4 or &pw_16
    mov             hd, hm
    shl             l_strideq, 2
%else
; stack layout [32bit only]:
; r15 - GPRs [mask_bitsm]
; r16 - m12/pb_mask
; r17 - bdmulq
; r18-24 - p2-q3
cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
                          dst, stride, mask, l, pic_reg, stride3, tmp
    RELOC_ARGS      h, 25*16
%if STACK_ALIGNMENT >= 16
    mov             r5d, r7m
%endif
    sar             r5d, 7
    and             r5d, 16                 ; 0 for 10bpc, 16 for 12bpc
    LEA             pic_regq, PIC_base
    mova            m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+17*16
    mova            [bdmulq], m0
    shl             dword lstridem, 2
%endif
    sub             lq, 4                   ; step back one l[] entry
    lea             stride3q, [strideq*3]
%if ARCH_X86_64
    mov             mask_bitsd, 0x3
    mova            m12, [pb_mask]
%else
%define mask_bitsm dword [esp+15*16+0*gprsize]
    mov             mask_bitsm, 0x3
    mova            m0, [PIC_sym(pb_mask)]
%define m12 [esp+16*16]
    mova            m12, m0
%endif

.loop:
%if ARCH_X86_64
    test            [maskq+8], mask_bitsd   ; vmask[2]
%else
    mov             r6d, mask_bitsm
    test            [maskq+8], r6d
%endif
    jz .no_flat16

    FILTER          16, h
    jmp .end

.no_flat16:
%if ARCH_X86_64
    test            [maskq+4], mask_bitsd   ; vmask[1]
%else
    test            [maskq+4], r6d
%endif
    jz .no_flat

    FILTER          8, h
    jmp .end

.no_flat:
%if ARCH_X86_64
    test            [maskq+0], mask_bitsd   ; vmask[0]
%else
    test            [maskq+0], r6d
%endif
    jz .no_filter

    FILTER          4, h
    jmp .end

.no_filter:
    ; no edge filtered: step dst down 8 rows to stay in sync with the
    ; filtered paths (FILTER h advances dst itself -- see macro above)
    lea             dstq, [dstq+strideq*8]
.end:
%if ARCH_X86_64
    pslld           m12, 2
    lea             lq, [lq+l_strideq*2]    ; next two l[] rows
    shl             mask_bitsd, 2
    sub             hd, 2
%else
    mova            m0, m12
    pslld           m0, 2
    mova            m12, m0
    add             lq, dword lstridem      ; no free register: add stride twice
    add             lq, dword lstridem
    shl             mask_bitsm, 2
    sub             dword hm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
;-----------------------------------------------------------------------------
; lpf_v_sb_uv_16bpc(dst, stride, mask, l, l_stride, lut, w, ...)
; Vertical chroma superblock loop filter. Chroma has no flat16 path:
; only vmask[1] -> FILTER 6 and vmask[0] -> FILTER 4 are tried.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           w, stride3, mstride, tmp, mask_bits, bdmul
    mov             r6d, r7m
    sar             r6d, 7
    and             r6d, 16                 ; 0 for 10bpc, 16 for 12bpc
    lea             bdmulq, [pw_4]
    add             bdmulq, r6
    mov             wd, wm
    shl             l_strideq, 2
    sub             lq, l_strideq           ; point l one row up
%else
; stack layout [32bit only]:
; r0 - GPRs [mask_bitsm, mstridem]
; r1 - m12/pb_mask
; r2 - bdmulq
cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
                           dst, stride, mask, mstride, pic_reg, stride3, tmp
    RELOC_ARGS      v, 3*16
%if STACK_ALIGNMENT >= 16
    mov             r5d, r7m
%endif
    sar             r5d, 7
    and             r5d, 16                 ; 0 for 10bpc, 16 for 12bpc
    LEA             pic_regq, PIC_base
    mova            m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+2*16
    mova            [bdmulq], m0
    shl             dword lstridem, 2
    sub             r3, dword lstridem
    mov             dword lm, r3
%endif
    mov             mstrideq, strideq
    neg             mstrideq
    lea             stride3q, [strideq*3]
%if ARCH_X86_64
    mov             mask_bitsd, 0x3
    mova            m12, [pb_mask]
%else
%define mask_bitsm dword [esp+0*gprsize]
%define mstridem dword [esp+1*gprsize]
    mov             mask_bitsm, 0x3
    mov             mstridem, mstrideq
    mova            m0, [PIC_sym(pb_mask)]
%define m12 [esp+1*16]
    mova            m12, m0
%endif

.loop:
%if ARCH_X86_64
    test            [maskq+4], mask_bitsd   ; vmask[1]
%else
    mov             r6d, mask_bitsm
    test            [maskq+4], r6d
%endif
    jz .no_flat

    FILTER          6, v
    jmp .end

.no_flat:
%if ARCH_X86_64
    test            [maskq+0], mask_bitsd   ; vmask[0]
%else
    test            [maskq+0], r6d
%endif
    jz .end

    FILTER          4, v

.end:
    ; advance to the next pair of edge units (see luma variant)
%if ARCH_X86_64
    pslld           m12, 2
    add             lq, 8
%else
    mova            m0, m12
    pslld           m0, 2
    mova            m12, m0
    add             dword lm, 8
%endif
    add             dstq, 16
%if ARCH_X86_64
    shl             mask_bitsd, 2
    sub             wd, 2
%else
    shl             mask_bitsm, 2
    sub             dword wm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
;-----------------------------------------------------------------------------
; lpf_h_sb_uv_16bpc(dst, stride, mask, l, l_stride, lut, h, ...)
; Horizontal chroma superblock loop filter: FILTER 6 / FILTER 4 only,
; walking down rows two edge units at a time.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           h, stride3, tmp, mask_bits, bdmul
    mov             r6d, r7m
    sar             r6d, 7
    and             r6d, 16                 ; 0 for 10bpc, 16 for 12bpc
    lea             bdmulq, [pw_4]
    add             bdmulq, r6
    mov             hd, hm
    shl             l_strideq, 2
%else
; stack layout [32bit only]:
; r0 - GPRs [mask_bitsm]
; r1 - m12/pb_mask
; r2 - bdmulq
; r3-8 - p2-q2
cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
                           dst, stride, mask, l, pic_reg, stride3, tmp
    RELOC_ARGS      h, 9*16
%if STACK_ALIGNMENT >= 16
    mov             r5d, r7m
%endif
    sar             r5d, 7
    and             r5d, 16                 ; 0 for 10bpc, 16 for 12bpc
    LEA             pic_regq, PIC_base
    mova            m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+2*16
    mova            [bdmulq], m0
    shl             dword lstridem, 2
%endif
    sub             lq, 4
    lea             stride3q, [strideq*3]
%if ARCH_X86_64
    mov             mask_bitsd, 0x3
    mova            m12, [pb_mask]
%else
%define mask_bitsm dword [esp+0*gprsize]
    mov             mask_bitsm, 0x3
    mova            m0, [PIC_sym(pb_mask)]
%define m12 [esp+1*16]
    mova            m12, m0
%endif

.loop:
%if ARCH_X86_64
    test            [maskq+4], mask_bitsd   ; vmask[1]
%else
    mov             r6d, mask_bitsm
    test            [maskq+4], r6d
%endif
    jz .no_flat

    FILTER          6, h
    jmp .end

.no_flat:
%if ARCH_X86_64
    test            [maskq+0], mask_bitsd   ; vmask[0]
%else
    test            [maskq+0], r6d
%endif
    jz .no_filter

    FILTER          4, h
    jmp .end

.no_filter:
    ; nothing filtered: advance past the 8 rows this iteration covers
    lea             dstq, [dstq+strideq*8]
.end:
%if ARCH_X86_64
    pslld           m12, 2
    lea             lq, [lq+l_strideq*2]
    shl             mask_bitsd, 2
    sub             hd, 2
%else
    mova            m0, m12
    pslld           m0, 2
    mova            m12, m0
    add             lq, dword lstridem      ; no free register: add stride twice
    add             lq, dword lstridem
    shl             mask_bitsm, 2
    sub             dword hm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET