; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

%if ARCH_X86_64
%define PIC_sym(a) a
%else
%define PIC_base $$
%define PIC_sym(a) pic_regq+a-PIC_base
%endif

pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
                     times 4 db 8, 9

pw_1:    times 8 dw 1
pw_2:    times 8 dw 2
pw_3:    times 8 dw 3
; 4 and 16 need to be next to each other since they are used as alternates
; depending on whether bitdepth is 10 or 12
pw_4:    times 8 dw 4
pw_16:   times 8 dw 16
pw_8:    times 8 dw 8
pw_4096: times 8 dw 4096

pb_mask: dd 1, 1, 2, 2

SECTION .text

%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%define extra_stack 2
%else
%define extra_stack 0
%endif
%endif

%macro RELOC_ARGS 2 ; h/v, off
ASSERT ARCH_X86_32
%if STACK_ALIGNMENT < 16
    mov          r5d, [rstk + stack_offset + 4*4 + 4]
%define lstridem [esp+%2+0*gprsize]
    mov     lstridem, r5d
    mov          r5d, [rstk + stack_offset + 4*5 + 4]
%define lutm [esp+%2+1*gprsize]
    mov         lutm, r5d
    mov          r5d, [rstk + stack_offset + 4*6 + 4]
%ifidn %1, v
%define wm [esp+%2+2*gprsize]
    mov           wm, r5d
    mov          r5d, [rstk + stack_offset + 4*3 + 4]
%define lm [esp+%2+3*gprsize]
    mov           lm, r5d
%else ; %1 == h
%define hm [esp+%2+2*gprsize]
    mov           hm, r5d
%endif ; %1==v
    mov          r5d, r7m
%define bdmulm [esp+%2+4*gprsize]
    mov       bdmulm, r5d
%else
%define lstridem r4m
%define lutm r5m
%ifidn %1, v
%define wm r6m
%define lm r3m
%else
%define hm r6m
%endif
%define bdmulm r7m
%endif ; STACK_ALIGNMENT
%endmacro

%macro UNRELOC_ARGS 0
%if ARCH_X86_32
%undef lm
%undef lstridem
%undef wm
%undef hm
%undef lutm
%endif
%endmacro

%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

%macro SPLATD 2
    movd          %1, %2
    pshufd        %1, %1, q0000
%endmacro

%macro SPLATW 2
    movd          %1, %2
    pshuflw       %1, %1, q0000
    punpcklqdq    %1, %1
%endmacro

; in:          out:
; mm%1   a b c d    a e i m
; mm%2   e f g h    b f j n
; mm%3   i j k l -> c g k o
; mm%4   m n o p    d h l p
%macro TRANSPOSE4X4W 5
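    ; Note: two interleave stages - punpckl/hwd pairs up words from
    ; adjacent rows, punpckl/hdq then pairs up the resulting dword
    ; groups, so each output register holds one column of the 4x4 block
    ; shown above. m%5 is scratch; the SWAPs at the end restore the
    ; natural %1-%4 output order.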
    punpcklwd    m%5, m%1, m%2
    punpckhwd    m%1, m%2
    punpcklwd    m%2, m%3, m%4
    punpckhwd    m%3, m%4
    punpckldq    m%4, m%5, m%2
    punpckhdq    m%5, m%2
    punpckldq    m%2, m%1, m%3
    punpckhdq    m%1, m%3

    SWAP          %1, %4
    SWAP          %2, %5, %3
%endmacro

; in:                        out:
; m%1   a b c d e f g h      a i q y 6 E M U
; m%2   i j k l m n o p      b j r z 7 F N V
; m%3   q r s t u v w x      c k s 0 8 G O W
; m%4   y z 0 1 2 3 4 5      d l t 1 9 H P X
; m%5   6 7 8 9 A B C D  ->  e m u 2 A I Q Y
; m%6   E F G H I J K L      f n v 3 B J R Z
; m%7   M N O P Q R S T      g o w 4 C K S +
; m%8   U V W X Y Z + =      h p x 5 D L T =
%if ARCH_X86_64
%macro TRANSPOSE8X8W 9
    ; m%1   a b c d e f g h      a i q y b j r z
    ; m%2   i j k l m n o p      c k s 0 d l t 1
    ; m%3   q r s t u v w x  ->  e m u 2 f n v 3
    ; m%4   y z 0 1 2 3 4 5      g o w 4 h p x 5
    TRANSPOSE4X4W %1, %2, %3, %4, %9

    ; m%5   6 7 8 9 A B C D      6 E M U 7 F N V
    ; m%6   E F G H I J K L      8 G O W 9 H P X
    ; m%7   M N O P Q R S T  ->  A I Q Y B J R Z
    ; m%8   U V W X Y Z + =      C K S + D L T =
    TRANSPOSE4X4W %5, %6, %7, %8, %9

    ; m%1   a i q y b j r z      a i q y 6 E M U
    ; m%2   c k s 0 d l t 1      b j r z 7 F N V
    ; m%3   e m u 2 f n v 3      c k s 0 8 G O W
    ; m%4   g o w 4 h p x 5      d l t 1 9 H P X
    ; m%5   6 E M U 7 F N V  ->  e m u 2 A I Q Y
    ; m%6   8 G O W 9 H P X      f n v 3 B J R Z
    ; m%7   A I Q Y B J R Z      g o w 4 C K S +
    ; m%8   C K S + D L T =      h p x 5 D L T =
    punpckhqdq   m%9, m%1, m%5
    punpcklqdq   m%1, m%5
    punpckhqdq   m%5, m%2, m%6
    punpcklqdq   m%2, m%6
    punpckhqdq   m%6, m%3, m%7
    punpcklqdq   m%3, m%7
    punpckhqdq   m%7, m%4, m%8
    punpcklqdq   m%4, m%8

    SWAP %8, %7, %4, %5, %3, %2, %9
%endmacro
%else ; x86-32
; input: 1-7 in registers, 8 in first memory [read-only]
; second memory is scratch, and may overlap with first or third memory
; output: 1-5,7-8 in registers, 6 in third memory [write-only]
%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment] [2x]
    TRANSPOSE4X4W %1, %2, %3, %4, %8
%ifnidn %9, ""
    mov%12       m%8, %9
%else
    mova         m%8, %10
%endif
    mova         %10, m%4
    TRANSPOSE4X4W %5, %6, %7, %8, %4
    punpckhqdq   m%4, m%1, m%5
    punpcklqdq   m%1, m%5
    punpckhqdq   m%5, m%2, m%6
    punpcklqdq   m%2, m%6
    punpckhqdq   m%6, m%3, m%7
    punpcklqdq   m%3, m%7
    mova         m%7, %10
%ifnidn %11, ""
    mov%13       %11, m%6
%else
    mova         %10, m%6
%endif
    punpckhqdq   m%6, m%7, m%8
    punpcklqdq   m%7, m%8

    ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
    SWAP %2, %4, %5, %3
    SWAP %6, %8
%endmacro
%endif ; x86-32/64

; transpose and write p1/p0/q0/q1, everything else is scratch
%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
    ; transpose 8x4
    punpcklwd     %5, %1, %2
    punpckhwd     %1, %2
    punpcklwd     %2, %3, %4
    punpckhwd     %3, %4
    punpckldq     %4, %5, %2
    punpckhdq     %5, %2
    punpckldq     %2, %1, %3
    punpckhdq     %1, %3

    ; write out
    movq   [dstq+strideq*0-4], %4
    movhps [dstq+strideq*1-4], %4
    movq   [dstq+strideq*2-4], %5
    movhps [dstq+stride3q -4], %5
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], %2
    movhps [dstq+strideq*1-4], %2
    movq   [dstq+strideq*2-4], %1
    movhps [dstq+stride3q -4], %1
    lea         dstq, [dstq+strideq*4]
%endmacro

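; FILTER is the body shared by all the exported functions below: it
; gathers the p/q pixels for a %1-wide (4/6/8/16) filter across a %2
; (h/v) edge, derives the E/I/H thresholds and filter masks from the L
; values and lut, applies the short filter plus the flat8/flat6 and
; (for wd=16) flat16 filters, and writes the result back. P1/P0/Q0/Q1
; etc. name the pixel rows (registers on x86-64, memory slots on
; x86-32); the h variants transpose the pixels on load and again on
; store.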
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
%if ARCH_X86_64
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
    mova          P1, [dstq+mstrideq*2]       ; p1
    mova          P0, [dstq+mstrideq*1]       ; p0
    mova          Q0, [dstq+strideq*0]        ; q0
    mova          Q1, [dstq+strideq*1]        ; q1
%else ; x86-32
%define P1 [dstq+mstrideq*2]
%define P0 [dstq+mstrideq*1]
%define Q0 [dstq+strideq*0]
%define Q1 [dstq+strideq*1]
%endif ; x86-32/64
%else ; %1 != 4
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea         tmpq, [dstq+mstrideq*4]
%if ARCH_X86_64
    ; we load p3 later
%define P2 m13
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%define Q2 m14
    mova          P2, [tmpq+strideq*1]
    mova          P1, [tmpq+strideq*2]
    mova          P0, [tmpq+stride3q]
    mova          Q0, [dstq+strideq*0]
    mova          Q1, [dstq+strideq*1]
    mova          Q2, [dstq+strideq*2]
%if %1 != 6
%define P3 [tmpq+strideq*0]
%define Q3 m15
    mova          Q3, [dstq+stride3q]
%endif ; %1 != 6
%else ; x86-32
%define P2 [tmpq+strideq*1]
%define P1 [dstq+mstrideq*2]
%define P0 [dstq+mstrideq*1]
%define Q0 [dstq+strideq*0]
%define Q1 [dstq+strideq*1]
%define Q2 [dstq+strideq*2]
%if %1 != 6
%define P3 [dstq+mstrideq*4]
%define Q3 [dstq+stride3q]
%endif ; %1 != 6
%endif ; x86-32/64
%endif ; %1 ==/!= 4
%else ; %2 != v
    ; load lines
%if %1 == 4
    movq          m0, [dstq+strideq*0-4]
    movq          m2, [dstq+strideq*1-4]
    movq          m4, [dstq+strideq*2-4]
    movq          m5, [dstq+stride3q -4]
    lea         tmpq, [dstq+strideq*4]
    movq          m3, [tmpq+strideq*0-4]
    movq          m6, [tmpq+strideq*1-4]
    movq          m1, [tmpq+strideq*2-4]
    movq          m7, [tmpq+stride3q -4]

    ; transpose 4x8
    ; m0: A-D0
    ; m2: A-D1
    ; m4: A-D2
    ; m5: A-D3
    ; m3: A-D4
    ; m6: A-D5
    ; m1: A-D6
    ; m7: A-D7
    punpcklwd     m0, m2
    punpcklwd     m4, m5
    punpcklwd     m3, m6
    punpcklwd     m1, m7
    ; m0: A0-1,B0-1,C0-1,D0-1
    ; m4: A2-3,B2-3,C2-3,D2-3
    ; m3: A4-5,B4-5,C4-5,D4-5
    ; m1: A6-7,B6-7,C6-7,D6-7
    punpckhdq     m2, m0, m4
    punpckldq     m0, m4
    punpckhdq     m4, m3, m1
    punpckldq     m3, m1
    ; m0: A0-3,B0-3
    ; m2: C0-3,D0-3
    ; m3: A4-7,B4-7
    ; m4: C4-7,D4-7
    punpckhqdq    m1, m0, m3
    punpcklqdq    m0, m3
    punpckhqdq    m3, m2, m4
    punpcklqdq    m2, m4
    ; m0: A0-7
    ; m1: B0-7
    ; m2: C0-7
    ; m3: D0-7
%if ARCH_X86_64
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%else
%define P1 [esp+3*mmsize]
%define P0 [esp+4*mmsize]
%define Q0 [esp+5*mmsize]
%define Q1 [esp+6*mmsize]
    mova          P1, m0
    mova          P0, m1
    mova          Q0, m2
    mova          Q1, m3
%endif
%elif %1 == 6 || %1 == 8
    movu          m0, [dstq+strideq*0-8]
    movu          m1, [dstq+strideq*1-8]
    movu          m2, [dstq+strideq*2-8]
    movu          m3, [dstq+stride3q -8]
    lea         tmpq, [dstq+strideq*4]
    movu          m4, [tmpq+strideq*0-8]
    movu          m5, [tmpq+strideq*1-8]
    movu          m6, [tmpq+strideq*2-8]
%if ARCH_X86_64
    movu          m7, [tmpq+stride3q -8]
%endif

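    ; x86-32 note: only 8 XMM registers are available, so the eighth row
    ; is loaded late and intermediates are spilled to stack slots instead
    ; of using m8-m15 like the x86-64 path.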
    ; transpose 8x16
    ; m0: A-H0,A-H8
    ; m1: A-H1,A-H9
    ; m2: A-H2,A-H10
    ; m3: A-H3,A-H11
    ; m4: A-H4,A-H12
    ; m5: A-H5,A-H13
    ; m6: A-H6,A-H14
    ; m7: A-H7,A-H15
%if ARCH_X86_64
    punpcklwd     m8, m0, m1
%else
    punpcklwd     m7, m0, m1
%endif
    punpckhwd     m0, m1
    punpcklwd     m1, m2, m3
    punpckhwd     m2, m3
    punpcklwd     m3, m4, m5
    punpckhwd     m4, m5
%if ARCH_X86_64
    punpcklwd     m5, m6, m7
    punpckhwd     m6, m7
%else
    mova  [rsp+3*16], m4
    movu          m4, [tmpq+stride3q -8]
    punpcklwd     m5, m6, m4
    punpckhwd     m6, m4
%endif
    ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
    ; m0: E0-1,F0-1,G0-1,H0-1
    ; m1: A2-3,B2-3,C2-3,D2-3
    ; m2: E2-3,F2-3,G2-3,H2-3
    ; m3: A4-5,B4-5,C4-5,D4-5
    ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
    ; m5: A6-7,B6-7,C6-7,D6-7
    ; m6: E6-7,F6-7,G6-7,H6-7
%if ARCH_X86_64
    punpckldq     m7, m8, m1
    punpckhdq     m8, m1
%else
    punpckldq     m4, m7, m1
    punpckhdq     m7, m1
%endif
    punpckldq     m1, m0, m2
    punpckhdq     m0, m2
    punpckldq     m2, m3, m5
    punpckhdq     m3, m5
%if ARCH_X86_64
    punpckldq     m5, m4, m6
    punpckhdq     m4, m6
%else
    mova  [rsp+4*16], m3
    mova          m3, [rsp+3*16]
    punpckldq     m5, m3, m6
    punpckhdq     m3, m6
%endif
    ; m7: A0-3,B0-3 [m4 on x86-32]
    ; m8: C0-3,D0-3 [m7 on x86-32]
    ; m1: E0-3,F0-3
    ; m0: G0-3,H0-3
    ; m2: A4-7,B4-7
    ; m3: C4-7,D4-7 [r4 on x86-32]
    ; m5: E4-7,F4-7
    ; m4: G4-7,H4-7 [m3 on x86-32]
%if ARCH_X86_64
%if %1 != 6
    punpcklqdq    m6, m7, m2
%endif
    punpckhqdq    m7, m2
    punpcklqdq    m2, m8, m3
    punpckhqdq    m8, m3
    punpcklqdq    m3, m1, m5
    punpckhqdq    m1, m5
%if %1 != 6
    punpckhqdq    m5, m0, m4
%endif
    punpcklqdq    m0, m4
%if %1 == 8
    mova  [rsp+1*16], m6
%define P3 [rsp+1*16]
%endif
    ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
    SWAP 7, 13
    SWAP 8, 2, 9
    SWAP 3, 10
    SWAP 1, 11
    SWAP 0, 14
    SWAP 5, 15
%define P2 m13
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%define Q2 m14
%if %1 == 8
%define Q3 m15
%endif
%else ; x86-32
%if %1 == 8
%define P3 [rsp+ 6*16]
    punpcklqdq    m6, m4, m2
    mova          P3, m6
%endif
    mova          m6, [rsp+4*16]
    punpckhqdq    m4, m2
    punpcklqdq    m2, m7, m6
    punpckhqdq    m7, m6
    punpcklqdq    m6, m1, m5
    punpckhqdq    m1, m5
%if %1 == 8
%define Q3 [rsp+24*16]
    punpckhqdq    m5, m0, m3
    mova          Q3, m5
%endif
    punpcklqdq    m0, m3
%if %1 == 8
%define P2 [rsp+18*16]
%define P1 [rsp+19*16]
%define P0 [rsp+20*16]
%define Q0 [rsp+21*16]
%define Q1 [rsp+22*16]
%define Q2 [rsp+23*16]
%else
%define P2 [rsp+3*16]
%define P1 [rsp+4*16]
%define P0 [rsp+5*16]
%define Q0 [rsp+6*16]
%define Q1 [rsp+7*16]
%define Q2 [rsp+8*16]
%endif
    mova          P2, m4
    mova          P1, m2
    mova          P0, m7
    mova          Q0, m6
    mova          Q1, m1
    mova          Q2, m0
%endif ; x86-32/64
%else ; %1 == 16
    ; We only use 14 pixels but we'll need the remainder at the end for
    ; the second transpose
    mova          m0, [dstq+strideq*0-16]
    mova          m1, [dstq+strideq*1-16]
    mova          m2, [dstq+strideq*2-16]
    mova          m3, [dstq+stride3q -16]
    lea         tmpq, [dstq+strideq*4]
    mova          m4, [tmpq+strideq*0-16]
    mova          m5, [tmpq+strideq*1-16]
    mova          m6, [tmpq+strideq*2-16]
%if ARCH_X86_64
    mova          m7, [tmpq+stride3q -16]

    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
    SWAP 5, 13
    SWAP 6, 8
    SWAP 7, 9
%define P2 m13
%define P1 m8
%define P0 m9
%else ; x86-32
%define P2 [esp+18*16]
%define P1 [esp+19*16]
%define P0 [esp+20*16]
    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
                  [tmpq+stride3q -16], P2, "", a, a
    mova          P1, m6
    mova          P0, m7
%endif ; x86-32/64
    mova  [rsp+ 7*16], m0
    mova  [rsp+ 8*16], m1
    mova  [rsp+ 9*16], m2
    mova  [rsp+10*16], m3
%define P3 [rsp+6*16]
    mova          P3, m4

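    ; Left 8x8 half done: p7-p4 are parked in [rsp+ 7*16]..[rsp+10*16]
    ; for the flat16 filter and the final store-back, p3 in [rsp+6*16].
    ; Now transpose the right 8x8 half (q side) the same way.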
    mova          m0, [dstq+strideq*0]
    mova          m1, [dstq+strideq*1]
    mova          m2, [dstq+strideq*2]
    mova          m3, [dstq+stride3q ]
    lea         tmpq, [dstq+strideq*4]
    mova          m4, [tmpq+strideq*0]
    mova          m5, [tmpq+strideq*1]
    mova          m6, [tmpq+strideq*2]
%if ARCH_X86_64
    mova          m7, [tmpq+stride3q ]

    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
    SWAP 0, 10
    SWAP 1, 11
    SWAP 2, 14
    SWAP 3, 15
%define Q0 m10
%define Q1 m11
%define Q2 m14
%define Q3 m15
%else ; x86-32
    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
                  [tmpq+stride3q ], [rsp+12*16], "", a, a
%define Q0 [esp+21*16]
%define Q1 [esp+22*16]
%define Q2 [esp+23*16]
%define Q3 [esp+24*16]
    mova          Q0, m0
    mova          Q1, m1
    mova          Q2, m2
    mova          Q3, m3
%endif ; x86-32/64

    mova  [rsp+11*16], m4
%if ARCH_X86_64
    mova  [rsp+12*16], m5
%endif
    mova  [rsp+13*16], m6
    mova  [rsp+14*16], m7
%endif ; %1 == 4/6/8/16
%endif ; %2 ==/!= v

    ; load L/E/I/H
%if ARCH_X86_32
%define l_strideq r5
    mov    l_strideq, dword lstridem
%ifidn %2, v
%define lq r3
    mov           lq, dword lm
%endif
%endif
%ifidn %2, v
%if cpuflag(sse4)
    pmovzxbw      m1, [lq]
    pmovzxbw      m0, [lq+l_strideq]
    pxor          m2, m2
%else ; ssse3
    movq          m1, [lq]
    movq          m0, [lq+l_strideq]
    pxor          m2, m2
    REPX {punpcklbw x, m2}, m1, m0
%endif ; ssse3/sse4
%else ; %2 != v
    movq          m0, [lq]                    ; l0, l1
    movq          m1, [lq+l_strideq]          ; l2, l3
    punpckldq     m0, m1                      ; l0, l2, l1, l3
    pxor          m2, m2
    punpcklbw     m1, m0, m2                  ; l0, l2
    punpckhbw     m0, m2                      ; l1, l3
%endif ; %2==/!=v
%if ARCH_X86_32
%ifidn %2, v
%undef lq
    mov     mstrideq, mstridem
%endif
%endif
    pcmpeqw       m5, m2, m0
    pand          m1, m5
    por           m0, m1                      ; l[x][] ? l[x][] : l[x-stride][]
    pshufb        m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
    pcmpeqw       m5, m2, m0                  ; !L
    psrlw         m5, 1
%if ARCH_X86_64
    psrlw         m2, m0, [lutq+128]
    SPLATW        m1, [lutq+136]
%else ; x86-32
    mov           r5, lutm
    psrlw         m2, m0, [r5+128]
    SPLATW        m1, [r5+136]
%endif ; x86-32/64
    pminsw        m2, m1
    pmaxsw        m2, [PIC_sym(pw_1)]         ; I
    psrlw         m1, m0, 4                   ; H
    paddw         m0, [PIC_sym(pw_2)]
    paddw         m0, m0
    paddw         m0, m2                      ; E
    REPX {pmullw x, [bdmulq]}, m0, m1, m2
%if ARCH_X86_32
%undef l_strideq
    lea     stride3q, [strideq*3]
%endif

    psubw         m3, P1, P0                  ; p1-p0
    psubw         m4, Q0, Q1                  ; q0-q1
    REPX {pabsw x, x}, m3, m4
    pmaxsw        m3, m5
    pmaxsw        m3, m4
    pcmpgtw       m7, m3, m1                  ; hev
%if %1 != 4
    psubw         m4, P2, P0                  ; p2-p0
    pabsw         m4, m4
    pmaxsw        m4, m3
%if %1 != 6
    mova          m6, P3                      ; p3
    psubw         m5, m6, P0                  ; p3-p0
    pabsw         m5, m5
    pmaxsw        m4, m5
%endif ; %1 != 6
    psubw         m5, Q0, Q2                  ; q0-q2
    pabsw         m5, m5
    pmaxsw        m4, m5
%if %1 != 6
    psubw         m5, Q0, Q3                  ; q0-q3
    pabsw         m5, m5
    pmaxsw        m4, m5
%endif ; %1 != 6
    pcmpgtw       m4, [bdmulq]                ; !flat8in

    psubw         m5, P2, P1                  ; p2-p1
    pabsw         m5, m5
%if %1 != 6
    psubw         m6, P2                      ; p3-p2
    pabsw         m6, m6
    pmaxsw        m5, m6
    psubw         m6, Q2, Q3                  ; q2-q3
    pabsw         m6, m6
    pmaxsw        m5, m6
%endif ; %1 != 6
    psubw         m6, Q2, Q1                  ; q2-q1
    pabsw         m6, m6
    pmaxsw        m5, m6

%if %1 == 16
    SPLATD        m6, [maskq+8]
    SPLATD        m1, [maskq+4]
    por           m6, m1
    pand          m6, m12
    pcmpeqd       m6, m12
    pand          m5, m6
%else ; %1 != 16
    SPLATD        m6, [maskq+4]
    pand          m6, m12
    pcmpeqd       m6, m12
    pand          m5, m6                      ; only apply fm-wide to wd>4 blocks
%endif ; %1==/!=16
    pmaxsw        m3, m5
%endif ; %1 != 4
    pcmpgtw       m3, m2

    psubw         m5, P1, Q1                  ; p1-q1
    psubw         m6, P0, Q0                  ; p0-q0
    REPX {pabsw x, x}, m5, m6
    paddw         m6, m6
    psrlw         m5, 1
    paddw         m5, m6                      ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    pcmpgtw       m5, m0                      ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
    por           m3, m5
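    ; m3 now holds the inverted filter mask (!fm): lanes where either the
    ; I-based edge-activity test or the E-based p0/q0 test failed. It is
    ; combined with the block mask bits from [maskq] below.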

%if %1 == 16

%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]
    mova          m1, [tmpq+strideq*2]
    mova          m2, [tmpq+stride3q]
%else ; %2 != v
    mova          m0, [rsp+ 8*16]
    mova          m1, [rsp+ 9*16]
    mova          m2, [rsp+10*16]
%endif ; %2==/!=v
    REPX {psubw x, P0}, m0, m1, m2
    REPX {pabsw x, x}, m0, m1, m2
    pmaxsw        m1, m0
    pmaxsw        m1, m2
%ifidn %2, v
    lea         tmpq, [dstq+strideq*4]
    mova          m0, [tmpq+strideq*0]
    mova          m2, [tmpq+strideq*1]
    mova          m5, [tmpq+strideq*2]
%else ; %2 != v
    mova          m0, [rsp+11*16]
    mova          m2, [rsp+12*16]
    mova          m5, [rsp+13*16]
%endif ; %2==/!=v
    REPX {psubw x, Q0}, m0, m2, m5
    REPX {pabsw x, x}, m0, m2, m5
    pmaxsw        m0, m2
    pmaxsw        m1, m5
    pmaxsw        m1, m0
    pcmpgtw       m1, [bdmulq]                ; !flat8out
    por           m1, m4                      ; !flat8in | !flat8out
    SPLATD        m2, [maskq+8]
    pand          m5, m2, m12
    pcmpeqd       m5, m12
    pandn         m1, m5                      ; flat16
    pandn         m5, m3, m1                  ; flat16 & fm
    SWAP 1, 5

    SPLATD        m5, [maskq+4]
    por           m5, m2
    pand          m2, m5, m12
    pcmpeqd       m2, m12
    pandn         m4, m2                      ; flat8in
    pandn         m2, m3, m4
    SWAP 2, 4
    SPLATD        m2, [maskq+0]
    por           m2, m5
    pand          m2, m12
    pcmpeqd       m2, m12
    pandn         m3, m2
    pandn         m0, m4, m3                  ; fm & !flat8 & !flat16
    SWAP 0, 3
    pandn         m0, m1, m4                  ; flat8 & !flat16
    SWAP 0, 4
%elif %1 != 4
    SPLATD        m0, [maskq+4]
    pand          m2, m0, m12
    pcmpeqd       m2, m12
    pandn         m4, m2
    pandn         m2, m3, m4                  ; flat8 & fm
    SWAP 2, 4
    SPLATD        m2, [maskq+0]
    por           m0, m2
    pand          m0, m12
    pcmpeqd       m0, m12
    pandn         m3, m0
    pandn         m0, m4, m3                  ; fm & !flat8
    SWAP 0, 3
%else ; %1 == 4
    SPLATD        m0, [maskq+0]
    pand          m0, m12
    pcmpeqd       m0, m12
    pandn         m3, m0                      ; fm
%endif ; %1==/!=4

    ; short filter
%if ARCH_X86_64
    SPLATW        m0, r7m
%else
    SPLATW        m0, bdmulm
%endif
    pcmpeqw       m2, m2
    psrlw         m0, 1                       ; 511 or 2047
    pxor          m2, m0                      ; -512 or -2048

    psubw         m5, Q0, P0                  ; q0-p0
    paddw         m6, m5, m5
    paddw         m6, m5                      ; 3*(q0-p0)
    psubw         m5, P1, Q1                  ; iclip_diff(p1-q1)
    pminsw        m5, m0
    pmaxsw        m5, m2
    pand          m5, m7                      ; f=iclip_diff(p1-q1)&hev
    paddw         m5, m6                      ; f=iclip_diff(3*(q0-p0)+f)
    pminsw        m5, m0
    pmaxsw        m5, m2
    pand          m3, m5                      ; f&=fm
    paddw         m5, m3, [PIC_sym(pw_3)]
    paddw         m3, [PIC_sym(pw_4)]
    REPX {pminsw x, m0}, m5, m3
    psraw         m5, 3                       ; f2
    psraw         m3, 3                       ; f1
    psubw         m0, m2                      ; 1023 or 4095
    pxor          m2, m2
%if ARCH_X86_64
    paddw         P0, m5
    psubw         Q0, m3
%else
    paddw         m5, P0
    psubw         m6, Q0, m3
    REPX {pminsw x, m0}, m5, m6
    REPX {pmaxsw x, m2}, m5, m6
%endif

    paddw         m3, [PIC_sym(pw_1)]
    psraw         m3, 1                       ; f=(f1+1)>>1
    pandn         m7, m3                      ; f&=!hev
    SWAP 7, 3
%if ARCH_X86_64
    paddw         P1, m3
    psubw         Q1, m3
    REPX {pminsw x, m0}, P1, P0, Q0, Q1
    REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
%else
    psubw         m7, Q1, m3
    paddw         m3, P1
    REPX {pminsw x, m0}, m7, m3
    REPX {pmaxsw x, m2}, m7, m3
%if %1 > 4
    mova          P1, m3
    mova          P0, m5
    mova          Q0, m6
    mova          Q1, m7
%endif
%endif

%if %1 == 16

; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
; m12=filter bits mask
; m13-15=p2/q2/q3
; m0,2-3,5-7 = free

    ; flat16 filter
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]        ; p6
    mova          m2, [tmpq+strideq*2]        ; p5
    mova          m7, [tmpq+stride3q]         ; p4
    mova          m6, [tmpq+strideq*4]        ; p3
    lea         tmpq, [dstq+mstrideq*4]
%else ; %2 != v
    mova          m0, [rsp+ 8*16]
    mova          m2, [rsp+ 9*16]
    mova          m7, [rsp+10*16]
    mova          m6, [rsp+ 6*16]
%endif ; %2==/!=v

    mova  [rsp+ 0*16], m4

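    ; The 15-tap flat16 filter keeps a running sum in m3: the +8 rounding
    ; bias is added once up front, each output is sum>>4, and between
    ; outputs the window slides by subtracting the outgoing taps and
    ; adding the incoming ones. Each output is blended with the original
    ; pixel row via the flat16 mask in m1.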
    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    psllw         m3, m0, 3                   ; p6*8
    paddw         m3, [PIC_sym(pw_8)]
    paddw         m5, m2, m7                  ; p5+p4
    psubw         m3, m0
    paddw         m5, m5                      ; (p5+p4)*2
    paddw         m3, m6                      ; p6*7+p3
    paddw         m5, P2                      ; (p5+p4)*2+p2
    paddw         m3, P1                      ; p6*7+p3+p1
    paddw         m5, P0                      ; (p5+p4)*2+p2+p0
    paddw         m3, Q0                      ; p6*7+p3+p1+q0
    paddw         m3, m5                      ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, m2
    por           m5, m4
%ifidn %2, v
    mova [tmpq+mstrideq*2], m5                ; p5
%else ; %2 != v
    mova   [rsp+9*16], m5
%endif ; %2==/!=v

    ; sub p6*2, add p3/q1
    paddw         m3, m6
    paddw         m5, m0, m0
    paddw         m3, Q1
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, m7
    por           m5, m4
%ifidn %2, v
    mova [tmpq+mstrideq*1], m5                ; p4
%else ; %2 != v
    mova  [rsp+10*16], m5
%endif ; %2==/!=v

    ; sub p6/p5, add p2/q2
    psubw         m3, m0
    paddw         m5, P2, Q2
    psubw         m3, m2
    paddw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, m6
    por           m5, m4
%ifidn %2, v
    mova [tmpq+strideq*0], m5                 ; p3
%else ; %2 != v
    mova   [rsp+6*16], m5
%endif ; %2==/!=v

%define WRITE_IN_PLACE 0
%ifidn %2, v
%if ARCH_X86_64
%define WRITE_IN_PLACE 1
%endif
%endif

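    ; WRITE_IN_PLACE (vertical edges on x86-64): p2-q0 can be stored to
    ; dst as they are produced, because the input rows live in registers.
    ; Otherwise they are staged in stack slots first, since the original
    ; p2-q0 rows are still needed as inputs for the remaining taps.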
    ; sub p6/p4, add p1/q3
    paddw         m3, P1
    paddw         m5, m0, m7
    paddw         m3, Q3
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, P2
    por           m5, m4
%if WRITE_IN_PLACE
    mova [tmpq+strideq*1], m5
%else
    mova   [rsp+1*16], m5                     ; don't clobber p2/m13
%endif

    ; sub p6/p3, add p0/q4
    paddw         m3, P0
    paddw         m5, m0, m6
%ifidn %2, v
    paddw         m3, [dstq+strideq*4]
%else ; %2 != v
    paddw         m3, [rsp+11*16]
%endif ; %2==/!=v
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, P1
    por           m5, m4
%if WRITE_IN_PLACE
    mova [dstq+mstrideq*2], m5
%else
    mova   [rsp+2*16], m5                     ; don't clobber p1/m3
%endif

    ; sub p6/p2, add q0/q5
    paddw         m3, Q0
    paddw         m5, m0, P2
%ifidn %2, v
%if ARCH_X86_32
    lea           r4, P2
%endif
    lea         tmpq, [dstq+strideq*4]
    paddw         m3, [tmpq+strideq*1]
%else ; %2 != v
    paddw         m3, [rsp+12*16]
%endif ; %2==/!=v
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, P0
    por           m5, m4
%if WRITE_IN_PLACE
    mova [dstq+mstrideq*1], m5
%else
    mova   [rsp+3*16], m5                     ; don't clobber p0/m4
%endif

    ; sub p6/p1, add q1/q6
    paddw         m3, Q1
    paddw         m5, m0, P1
%ifidn %2, v
    mova          m0, [tmpq+strideq*2]        ; q6
%else ; %2 != v
    mova          m0, [rsp+13*16]             ; q6
%endif ; %2==/!=v
    paddw         m3, m0
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, Q0
    por           m5, m4
%if WRITE_IN_PLACE
    mova      [dstq], m5
%else
    mova   [rsp+4*16], m5                     ; don't clobber q0/m5
%endif

    ; sub p5/p0, add q2/q6
    paddw         m3, Q2
    paddw         m5, m2, P0
    paddw         m3, m0
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, Q1
    por           m2, m5, m4                  ; don't clobber q1/m6

    ; sub p4/q0, add q3/q6
    paddw         m3, Q3
    paddw         m7, Q0
    paddw         m3, m0
    psubw         m3, m7
    psrlw         m7, m3, 4
    pand          m7, m1
    pandn         m4, m1, Q2
    por           m7, m4                      ; don't clobber q2/m14

    ; sub p3/q1, add q4/q6
%ifidn %2, v
    paddw         m3, [tmpq+strideq*0]
%else ; %2 != v
    paddw         m3, [rsp+11*16]
%endif ; %2==/!=v
    paddw         m6, Q1
    paddw         m3, m0
    psubw         m3, m6
    psrlw         m6, m3, 4
    pand          m6, m1
    pandn         m4, m1, Q3
    por           m6, m4
%if WRITE_IN_PLACE
    mova [tmpq+mstrideq], m6                  ; q3
%else
    mova   [rsp+5*16], m6
%endif ; WRITE_IN_PLACE

    ; sub p2/q2, add q5/q6
%ifidn %2, v
    paddw         m3, [tmpq+strideq*1]
%if ARCH_X86_64
    paddw         m5, P2, Q2
%else
    ; tmpq is clobbered, so we use a backup pointer for P2 instead
    paddw         m5, [r4], Q2
    mov     pic_regq, pic_regm
%endif
%else ; %2 != v
    paddw         m3, [rsp+12*16]
    paddw         m5, P2, Q2
%endif ; %2==/!=v
    paddw         m3, m0
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
%ifidn %2, v
    pandn         m4, m1, [tmpq+strideq*0]
%else ; %2 != v
    pandn         m4, m1, [rsp+11*16]
%endif ; %2==/!=v
    por           m5, m4
%ifidn %2, v
    mova [tmpq+strideq*0], m5                 ; q4
%else ; %2 != v
    mova  [rsp+11*16], m5
%endif ; %2==/!=v

    ; sub p1/q3, add q6*2
    psubw         m3, P1
    paddw         m0, m0
    psubw         m3, Q3
    paddw         m3, m0
    psrlw         m5, m3, 4
    pand          m5, m1
%ifidn %2, v
    pandn         m4, m1, [tmpq+strideq*1]
%else ; %2 != v
    pandn         m4, m1, [rsp+12*16]
%endif ; %2==/!=v
    por           m5, m4
%ifidn %2, v
    mova [tmpq+strideq*1], m5                 ; q5
%else ; %2 != v
    mova  [rsp+12*16], m5
%endif ; %2==/!=v

    mova          m4, [rsp+0*16]
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*4]
%endif
%if ARCH_X86_64
    SWAP 2, 11
    SWAP 7, 14
    SWAP 6, 15
%else ; x86-32
    mova          Q1, m2
    mova          Q2, m7
%endif ; x86-32/64
%if WRITE_IN_PLACE
    mova          P2, [tmpq+strideq*1]
    mova          P1, [tmpq+strideq*2]
    mova          P0, [tmpq+stride3q]
    mova          Q0, [dstq]
%elif ARCH_X86_64
    mova          P2, [rsp+1*16]
    mova          P1, [rsp+2*16]
    mova          P0, [rsp+3*16]
    mova          Q0, [rsp+4*16]
%else ; !WRITE_IN_PLACE & x86-32
    mova          m0, [rsp+1*16]
    mova          m1, [rsp+2*16]
    mova          m2, [rsp+3*16]
    mova          m3, [rsp+4*16]
    mova          m7, [rsp+5*16]
    mova          P2, m0
    mova          P1, m1
    mova          P0, m2
    mova          Q0, m3
    mova          Q3, m7
%endif ; WRITE_IN_PLACE / x86-32/64
%undef WRITE_IN_PLACE
%endif ; %1 == 16

%if %1 >= 8

    ; flat8 filter
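    ; Weighted 8-sample averages, rounded as (sum+4)>>3 via pmulhrsw
    ; with pw_4096: x*4096 is x<<12, and pmulhrsw's (a*b+0x4000)>>15
    ; then yields (x+4)>>3. Each result has the original sample
    ; subtracted, is masked by flat8 (m4), and the original is added
    ; back afterwards, so non-flat lanes pass through unchanged.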
    mova          m0, P3                      ; p3
    paddw         m1, m0, P2                  ; p3+p2
    paddw         m2, P1, P0                  ; p1+p0
    paddw         m3, m1, m1                  ; 2*(p3+p2)
    paddw         m2, m0                      ; p1+p0+p3
    paddw         m3, Q0                      ; 2*(p3+p2)+q0
    paddw         m2, m3                      ; 3*p3+2*p2+p1+p0+q0
    pmulhrsw      m7, m2, [PIC_sym(pw_4096)]
    psubw         m7, P2
    pand          m7, m4

    paddw         m3, P1, Q1                  ; p1+q1
    psubw         m2, m1                      ; 2*p3+p2+p1+p0+q0
    paddw         m2, m3                      ; 2*p3+p2+2*p1+p0+q0+q1
    pmulhrsw      m3, m2, [PIC_sym(pw_4096)]
    psubw         m3, P1
    pand          m3, m4

    paddw         m5, m0, P1                  ; p3+p1
    paddw         m6, P0, Q2                  ; p0+q2
    psubw         m2, m5                      ; p3+p2+p1+p0+q0+q1
    paddw         m2, m6                      ; p3+p2+p1+2*p0+q0+q1+q2
    pmulhrsw      m5, m2, [PIC_sym(pw_4096)]
    psubw         m5, P0
    pand          m5, m4

    paddw         m6, m0, P0                  ; p3+p0
    paddw         m1, Q0, Q3                  ; q0+q3
    psubw         m2, m6                      ; p2+p1+p0+q0+q1+q2
    paddw         m2, m1                      ; p2+p1+p0+2*q0+q1+q2+q3
    pmulhrsw      m6, m2, [PIC_sym(pw_4096)]
    psubw         m6, Q0
    pand          m6, m4

    paddw         m2, Q1                      ; p2+p1+p0+2*q0+2*q1+q2+q3
    paddw         m2, Q3                      ; p2+p1+p0+2*q0+2*q1+q2+2*q3
    paddw         m1, P2, Q0                  ; p2+q0
    psubw         m2, m1                      ; p1+p0+q0+2*q1+q2+2*q3
    pmulhrsw      m1, m2, [PIC_sym(pw_4096)]
    psubw         m1, Q1
    pand          m1, m4

    psubw         m2, P1                      ; p0+q0+2*q1+q2+2*q3
    psubw         m2, Q1                      ; p0+q0+q1+q2+2*q3
    paddw         m0, Q3, Q2                  ; q3+q2
    paddw         m2, m0                      ; p0+q0+q1+2*q2+3*q3
    pmulhrsw      m2, [PIC_sym(pw_4096)]
    psubw         m2, Q2
    pand          m2, m4

    paddw         m7, P2
    paddw         m3, P1
    paddw         m5, P0
    paddw         m6, Q0
    paddw         m1, Q1
    paddw         m2, Q2

%ifidn %2, v
    mova [tmpq+strideq*1], m7                 ; p2
    mova [tmpq+strideq*2], m3                 ; p1
    mova [tmpq+stride3q ], m5                 ; p0
    mova [dstq+strideq*0], m6                 ; q0
    mova [dstq+strideq*1], m1                 ; q1
    mova [dstq+strideq*2], m2                 ; q2
%else ; %2 != v
    mova          m0, P3

%if %1 == 8
    lea         tmpq, [dstq+strideq*4]
%if ARCH_X86_64
    SWAP 4, 15
    TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8
%else
    TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \
                  Q3, [tmpq+strideq*1-8], a, u
%endif

    ; write 8x8
    movu [dstq+strideq*0-8], m0
    movu [dstq+strideq*1-8], m7
    movu [dstq+strideq*2-8], m3
    movu [dstq+stride3q -8], m5
    movu [tmpq+strideq*0-8], m6
%if ARCH_X86_64
    movu [tmpq+strideq*1-8], m1
%endif
    movu [tmpq+strideq*2-8], m2
    movu [tmpq+stride3q -8], m4
    lea         dstq, [dstq+strideq*8]
%else ; %1 != 8
%if ARCH_X86_64
    SWAP 6, 8
    SWAP 1, 9
    SWAP 2, 10
%else
    mova   [rsp+1*16], m6
    mova   [rsp+2*16], m1
    mova   [rsp+3*16], m2
%endif

    mova          m1, [rsp+ 7*16]
    mova          m2, [rsp+ 8*16]
    mova          m4, [rsp+ 9*16]
    mova          m6, [rsp+10*16]
    lea         tmpq, [dstq+strideq*4]
%if ARCH_X86_64
    TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11
%else
    mova   [rsp+7*16], m5
    TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \
                  [rsp+7*16], [tmpq+strideq*1-16], a, a
%endif

    mova [dstq+strideq*0-16], m1
    mova [dstq+strideq*1-16], m2
    mova [dstq+strideq*2-16], m4
    mova [dstq+stride3q -16], m6
    mova [tmpq+strideq*0-16], m0
%if ARCH_X86_64
    mova [tmpq+strideq*1-16], m7
%endif
    mova [tmpq+strideq*2-16], m3
    mova [tmpq+stride3q -16], m5

%if ARCH_X86_64
    SWAP 6, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 4, 15
%else
    mova          m6, [rsp+1*16]
    mova          m1, [rsp+2*16]
    mova          m2, [rsp+3*16]
    mova          m4, Q3
%endif
    mova          m0, [rsp+11*16]
    mova          m3, [rsp+12*16]
    mova          m5, [rsp+13*16]
%if ARCH_X86_64
    mova          m7, [rsp+14*16]
    TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8
%else
    TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \
                  [rsp+14*16], [tmpq+strideq*1], a, a
%endif
    mova [dstq+strideq*0], m6
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+stride3q ], m4
    mova [tmpq+strideq*0], m0
%if ARCH_X86_64
    mova [tmpq+strideq*1], m3
%endif
    mova [tmpq+strideq*2], m5
    mova [tmpq+stride3q ], m7
    lea         dstq, [dstq+strideq*8]
%endif ; %1==/!=8
%endif ; %2==/!=v
%elif %1 == 6
    ; flat6 filter
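    ; Same scheme as flat8, with the narrower 6-pixel support used for
    ; wd=6 edges: only p1/p0/q0/q1 are rewritten, again as rounded
    ; (sum+4)>>3 through pmulhrsw, masked by m4 and with the original
    ; sample added back.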
    paddw         m3, P1, P0                  ; p1+p0
    paddw         m3, P2                      ; p2+p1+p0
    paddw         m6, P2, Q0                  ; p2+q0
    paddw         m3, m3                      ; 2*(p2+p1+p0)
    paddw         m3, m6                      ; p2+2*(p2+p1+p0)+q0
    pmulhrsw      m2, m3, [PIC_sym(pw_4096)]
    psubw         m2, P1
    pand          m2, m4

    paddw         m3, Q0                      ; p2+2*(p2+p1+p0+q0)
    paddw         m6, P2, P2                  ; 2*p2
    paddw         m3, Q1                      ; p2+2*(p2+p1+p0+q0)+q1
    psubw         m3, m6                      ; p2+2*(p1+p0+q0)+q1
    pmulhrsw      m5, m3, [PIC_sym(pw_4096)]
    psubw         m5, P0
    pand          m5, m4

    paddw         m3, Q1                      ; p2+2*(p1+p0+q0+q1)
    paddw         m6, P2, P1                  ; p2+p1
    paddw         m3, Q2                      ; p2+2*(p1+p0+q0+q1)+q2
    psubw         m3, m6                      ; p1+2*(p0+q0+q1)+q2
    pmulhrsw      m6, m3, [PIC_sym(pw_4096)]
    psubw         m6, Q0
    pand          m6, m4

    psubw         m3, P1                      ; 2*(p0+q0+q1)+q2
%if ARCH_X86_64
    paddw         Q2, Q2                      ; q2*2
%else
    mova          m0, Q2
    paddw         m0, m0
%endif
    psubw         m3, P0                      ; p0+2*(q0+q1)+q2
%if ARCH_X86_64
    paddw         m3, Q2                      ; p0+2*(q0+q1+q2)+q2
%else
    paddw         m3, m0
%endif
    pmulhrsw      m3, [PIC_sym(pw_4096)]
    psubw         m3, Q1
    pand          m3, m4

    paddw         m2, P1
    paddw         m5, P0
    paddw         m6, Q0
    paddw         m3, Q1

%ifidn %2, v
    mova [dstq+mstrideq*2], m2                ; p1
    mova [dstq+mstrideq*1], m5                ; p0
    mova [dstq+strideq*0], m6                 ; q0
    mova [dstq+strideq*1], m3                 ; q1
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
%endif ; %2==/!=v
%else ; %1 == 4
%if ARCH_X86_64
%ifidn %2, v
    mova [dstq+mstrideq*2], P1                ; p1
    mova [dstq+mstrideq*1], P0                ; p0
    mova [dstq+strideq*0], Q0                 ; q0
    mova [dstq+strideq*1], Q1                 ; q1
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
%endif ; %2==/!=v
%else ; x86-32
%ifidn %2, v
    mova [dstq+mstrideq*2], m3
    mova [dstq+mstrideq*1], m5
    mova [dstq+strideq*0], m6
    mova [dstq+strideq*1], m7
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
%endif ; %2==/!=v
%endif ; x86-32/64
%endif ; %1
%undef P3
%undef P2
%undef P1
%undef P0
%undef Q0
%undef Q1
%undef Q2
%undef Q3
%endmacro

INIT_XMM ssse3
; stack layout:
; r0 - flat8 backup inside flat16 code
%if ARCH_X86_64
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
                          dst, stride, mask, l, l_stride, lut, \
                          w, stride3, mstride, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           wd, wm
    shl    l_strideq, 2
    sub           lq, l_strideq
%else
; stack layout [32bit only]:
; r1-4 - p2-q0 post-filter16
; r5 - p3
; r6 - q3 post-filter16
; r7 - GPRs [mask_bitsm, mstridem]
; r8 - m12/pb_mask
; r9 - bdmulq
cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
                          dst, stride, mask, mstride, pic_reg, stride3, tmp
    RELOC_ARGS v, 10*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
%define pic_regm dword [esp+7*16+2*gprsize]
    mov     pic_regm, pic_regq
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+9*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
    sub           r3, dword lstridem
    mov     dword lm, r3
%endif
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mstridem dword [esp+7*16+1*gprsize]
    mov     mstridem, mstrideq
%define mask_bitsm dword [esp+7*16+0*gprsize]
    mov   mask_bitsm, 0x3
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+8*16]
    mova         m12, m0
%endif

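; Each .loop iteration covers two 4-pixel filter units (8 pixels, i.e.
; 16 bytes of dst): the two low bits of mask_bits select which vmask
; bits to test, and mask_bits as well as the pb_mask row in m12 are
; shifted left by 2 per iteration.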
.loop:
%if ARCH_X86_64
    test   [maskq+8], mask_bitsd              ; vmask[2]
%else
    mov          r6d, mask_bitsm
    test   [maskq+8], r6d
%endif
    jz .no_flat16

    FILTER 16, v
    jmp .end

.no_flat16:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd              ; vmask[1]
%else
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER 8, v
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd              ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .end

    FILTER 4, v

.end:
%if ARCH_X86_64
    pslld        m12, 2
    add           lq, 8
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add     dword lm, 8
%endif
    add         dstq, 16
%if ARCH_X86_64
    shl   mask_bitsd, 2
    sub           wd, 2
%else
    shl   mask_bitsm, 2
    sub     dword wm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
; stack layout:
; r0 - flat8 backup inside flat16
; r1-4 - p2-q0 post-filter16 backup
; r5 - q3 post-filter16 backup
; r6 - p3
; r7-10 - p7-4
; r11-14 - q4-7
%if ARCH_X86_64
cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           hd, hm
    shl    l_strideq, 2
%else
; stack layout [32bit only]:
; r15 - GPRs [mask_bitsm]
; r16 - m12/pb_mask
; r17 - bdmulq
; r18-24 - p2-q3
cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
                          dst, stride, mask, l, pic_reg, stride3, tmp
    RELOC_ARGS h, 25*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+17*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
%endif
    sub           lq, 4
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mask_bitsm dword [esp+15*16+0*gprsize]
    mov   mask_bitsm, 0x3
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+16*16]
    mova         m12, m0
%endif

.loop:
%if ARCH_X86_64
    test   [maskq+8], mask_bitsd              ; vmask[2]
%else
    mov          r6d, mask_bitsm
    test   [maskq+8], r6d
%endif
    jz .no_flat16

    FILTER 16, h
    jmp .end

.no_flat16:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd              ; vmask[1]
%else
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER 8, h
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd              ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .no_filter

    FILTER 4, h
    jmp .end

.no_filter:
    lea         dstq, [dstq+strideq*8]
.end:
%if ARCH_X86_64
    pslld        m12, 2
    lea           lq, [lq+l_strideq*2]
    shl   mask_bitsd, 2
    sub           hd, 2
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add           lq, dword lstridem
    add           lq, dword lstridem
    shl   mask_bitsm, 2
    sub     dword hm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

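; The chroma (uv) variants follow the same structure, but the widest
; chroma filter is wd=6, so only vmask[1] and vmask[0] are tested and
; the flat16 path is never taken.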
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           w, stride3, mstride, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           wd, wm
    shl    l_strideq, 2
    sub           lq, l_strideq
%else
; stack layout [32bit only]:
; r0 - GPRs [mask_bitsm, mstridem]
; r1 - m12/pb_mask
; r2 - bdmulq
cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
                           dst, stride, mask, mstride, pic_reg, stride3, tmp
    RELOC_ARGS v, 3*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+2*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
    sub           r3, dword lstridem
    mov     dword lm, r3
%endif
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mask_bitsm dword [esp+0*gprsize]
%define mstridem dword [esp+1*gprsize]
    mov   mask_bitsm, 0x3
    mov     mstridem, mstrideq
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+1*16]
    mova         m12, m0
%endif

.loop:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd              ; vmask[1]
%else
    mov          r6d, mask_bitsm
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER 6, v
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd              ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .end

    FILTER 4, v

.end:
%if ARCH_X86_64
    pslld        m12, 2
    add           lq, 8
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add     dword lm, 8
%endif
    add         dstq, 16
%if ARCH_X86_64
    shl   mask_bitsd, 2
    sub           wd, 2
%else
    shl   mask_bitsm, 2
    sub     dword wm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           h, stride3, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           hd, hm
    shl    l_strideq, 2
%else
; stack layout [32bit only]:
; r0 - GPRs [mask_bitsm]
; r1 - m12/pb_mask
; r2 - bdmulq
; r3-8 - p2-q2
cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
                           dst, stride, mask, l, pic_reg, stride3, tmp
    RELOC_ARGS h, 9*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+2*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
%endif
    sub           lq, 4
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mask_bitsm dword [esp+0*gprsize]
    mov   mask_bitsm, 0x3
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+1*16]
    mova         m12, m0
%endif

.loop:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd              ; vmask[1]
%else
    mov          r6d, mask_bitsm
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER 6, h
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd              ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .no_filter

    FILTER 4, h
    jmp .end

.no_filter:
    lea         dstq, [dstq+strideq*8]
.end:
%if ARCH_X86_64
    pslld        m12, 2
    lea           lq, [lq+l_strideq*2]
    shl   mask_bitsd, 2
    sub           hd, 2
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add           lq, dword lstridem
    add           lq, dword lstridem
    shl   mask_bitsm, 2
    sub     dword hm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET