; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 16 30 31pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 32pb_7_1: times 8 db 7, 1 33pb_3_1: times 8 db 3, 1 34pb_2_1: times 8 db 2, 1 35pb_m1_0: times 8 db -1, 0 36pb_m1_1: times 8 db -1, 1 37pb_m1_2: times 8 db -1, 2 38pb_1: times 16 db 1 39pb_2: times 16 db 2 40pb_3: times 16 db 3 41pb_4: times 16 db 4 42pb_16: times 16 db 16 43pb_63: times 16 db 63 44pb_64: times 16 db 64 45pb_128: times 16 db 0x80 46pb_129: times 16 db 0x81 47pb_240: times 16 db 0xf0 48pb_248: times 16 db 0xf8 49pb_254: times 16 db 0xfe 50 51pw_2048: times 8 dw 2048 52pw_4096: times 8 dw 4096 53 54pd_mask: dd 1, 2, 4, 8 55 56SECTION .text 57 58%macro ABSSUB 4 ; dst, a, b, tmp 59 psubusb %1, %2, %3 60 psubusb %4, %3, %2 61 por %1, %4 62%endmacro 63 64%macro TRANSPOSE_16x4_AND_WRITE_4x16 5 65 ; transpose 16x4 66 punpcklbw m%5, m%1, m%2 67 punpckhbw m%1, m%2 68 punpcklbw m%2, m%3, m%4 69 punpckhbw m%3, m%4 70 punpcklwd m%4, m%5, m%2 71 punpckhwd m%5, m%2 72 punpcklwd m%2, m%1, m%3 73 punpckhwd m%1, m%3 74 75 ; write out 76%assign %%n 0 77%rep 4 78 movd [dstq+strideq *0-2], xm%4 79 movd [dstq+strideq *4-2], xm%5 80 movd [dstq+strideq *8-2], xm%2 81 movd [dstq+stride3q*4-2], xm%1 82 add dstq, strideq 83%if %%n < 3 84 psrldq xm%4, 4 85 psrldq xm%5, 4 86 psrldq xm%2, 4 87 psrldq xm%1, 4 88%endif 89%assign %%n (%%n+1) 90%endrep 91 lea dstq, [dstq+stride3q*4] 92%endmacro 93 94%macro TRANSPOSE_16X16B 2 ; output_transpose, mem 95%if %1 == 0 96 mova %2, m15 ; m7 in 32-bit 97%endif 98 99 ; input in m0-7 100 punpcklbw m15, m0, m1 101 punpckhbw m0, m1 102 punpcklbw m1, m2, m3 103 punpckhbw m2, m3 104 punpcklbw m3, m4, m5 105 punpckhbw m4, m5 106%if ARCH_X86_64 107 SWAP 4, 5, 7 108%else 109 %if %1 == 0 110 mova m5, %2 111 %else 112 mova m5, [esp+1*16] 113 %endif 114 mova %2, m4 115%endif 116 punpcklbw m4, m6, m5 117 punpckhbw m6, m5 118 119 ; interleaved in m15,0,1,2,3,7,4,6 120 punpcklwd m5, m15, m1 121 
punpckhwd m15, m1 122 punpcklwd m1, m0, m2 123 punpckhwd m0, m2 124 punpcklwd m2, m3, m4 125 punpckhwd m3, m4 126%if ARCH_X86_64 127 SWAP 3, 4, 7 128%else 129 mova m4, %2 130 mova %2, m3 131%endif 132 punpcklwd m3, m4, m6 133 punpckhwd m4, m6 134 135 ; interleaved in m5,15,1,0,2,7,3,4 136 punpckldq m6, m5, m2 137 punpckhdq m5, m2 138%if ARCH_X86_64 139 SWAP 2, 7, 5 140%else 141 mova m2, %2 142 mova [esp+1*16], m5 143%endif 144 punpckldq m5, m15, m2 145 punpckhdq m15, m2 146 punpckldq m2, m1, m3 147 punpckhdq m1, m3 148 punpckldq m3, m0, m4 149 punpckhdq m0, m4 150 151%if ARCH_X86_32 152 mova [esp+0*16], m6 153 mova [esp+2*16], m5 154 mova [esp+3*16], m15 155 mova [esp+4*16], m2 156 mova [esp+5*16], m1 157 mova [esp+6*16], m3 158 mova [esp+7*16], m0 159 mova m8, [esp+ 8*16] 160 mova m9, [esp+ 9*16] 161 mova m10, [esp+10*16] 162 %if %1 == 0 163 mova m11, [esp+11*16] 164 mova m12, [esp+12*16] 165 mova m13, [esp+13*16] 166 mova m14, [esp+14*16] 167 %else 168 mova m11, [esp+20*16] 169 mova m12, [esp+15*16] 170 mova m13, [esp+16*16] 171 mova m14, [esp+17*16] 172 %endif 173%endif 174 175 ; input in m8-m15 176%if ARCH_X86_64 177 SWAP 7, 4 178%endif 179 punpcklbw m7, m8, m9 180 punpckhbw m8, m9 181 punpcklbw m9, m10, m11 182 punpckhbw m10, m11 183 punpcklbw m11, m12, m13 184 punpckhbw m12, m13 185%if ARCH_X86_64 186 mova m13, %2 187%else 188 %if %1 == 0 189 mova m13, [esp+15*16] 190 %else 191 mova m13, [esp+18*16] 192 %endif 193%endif 194 mova %2, m12 195 punpcklbw m12, m14, m13 196 punpckhbw m14, m14, m13 197 198 ; interleaved in m7,8,9,10,11,rsp%2,12,14 199 punpcklwd m13, m7, m9 200 punpckhwd m7, m9 201 punpcklwd m9, m8, m10 202 punpckhwd m8, m10 203 punpcklwd m10, m11, m12 204 punpckhwd m11, m12 205 mova m12, %2 206 mova %2, m11 207 punpcklwd m11, m12, m14 208 punpckhwd m12, m14 209 210 ; interleaved in m13,7,9,8,10,rsp%2,11,12 211 punpckldq m14, m13, m10 212 punpckhdq m13, m10 213 punpckldq m10, m9, m11 214 punpckhdq m9, m11 215 punpckldq m11, m8, m12 216 punpckhdq m8, 
m12 217 mova m12, %2 218 mova %2, m8 219 punpckldq m8, m7, m12 220 punpckhdq m7, m12 221 222%if ARCH_X86_32 223 mova [esp+ 8*16], m10 224 mova [esp+ 9*16], m9 225 mova [esp+10*16], m11 226 SWAP 6, 1 227 SWAP 4, 2 228 SWAP 5, 3 229 mova m6, [esp+0*16] 230 mova m4, [esp+1*16] 231 mova m5, [esp+2*16] 232%endif 233 234 ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 235 punpcklqdq m12, m6, m14 236 punpckhqdq m6, m14 237 punpcklqdq m14, m4, m13 238 punpckhqdq m4, m13 239 punpcklqdq m13, m5, m8 240 punpckhqdq m5, m8 241%if ARCH_X86_64 242 SWAP 8, 5 243%else 244 mova m8, [esp+3*16] 245 mova [esp+27*16], m5 246 %define m15 m8 247%endif 248 punpcklqdq m5, m15, m7 249 punpckhqdq m15, m7 250 251%if ARCH_X86_32 252 mova [esp+11*16], m12 253 mova [esp+12*16], m6 254 mova [esp+13*16], m14 255 mova [esp+14*16], m4 256 mova [esp+26*16], m13 257 mova [esp+ 0*16], m5 258 mova [esp+ 1*16], m15 259 mova m2, [esp+ 4*16] 260 mova m10, [esp+ 8*16] 261 mova m1, [esp+ 5*16] 262 mova m9, [esp+ 9*16] 263 mova m3, [esp+ 6*16] 264 mova m11, [esp+10*16] 265 mova m0, [esp+ 7*16] 266%endif 267 268 punpcklqdq m7, m2, m10 269 punpckhqdq m2, m10 270 punpcklqdq m10, m1, m9 271 punpckhqdq m1, m9 272 punpcklqdq m9, m3, m11 273 punpckhqdq m3, m11 274 mova m11, %2 275%if ARCH_X86_32 276 %define m12 m3 277%endif 278 mova %2, m12 279 punpcklqdq m12, m0, m11 280 punpckhqdq m0, m11 281%if %1 == 1 282 mova m11, %2 283%endif 284 285%if ARCH_X86_64 286 ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 287 SWAP 0, 11, 1, 6, 5, 8, 7, 15 288 SWAP 2, 14, 12, 9 289 SWAP 3, 4, 13 290%else 291 %if %1 == 0 292 mova [esp+15*16], m9 293 mova [esp+17*16], m12 294 mova [esp+18*16], m0 295 mova [esp+28*16], m10 296 mova [esp+29*16], m1 297 mova m3, [esp+0*16] 298 mova m4, [esp+1*16] 299 SWAP m5, m7 300 SWAP m6, m2 301 %else 302 SWAP 0, 7 303 SWAP 3, 1, 2, 4, 6 304 %endif 305%endif 306%endmacro 307 308%macro FILTER 2 ; width [4/6/8/16], dir [h/v] 309%if ARCH_X86_64 310 %define %%flat8mem [rsp+0*16] 311 %define 
%%q2mem [rsp+1*16] 312 %define %%q3mem [rsp+2*16] 313%else 314 %if %1 == 4 || %1 == 6 315 %define %%p2mem [esp+ 8*16] 316 %define %%q2mem [esp+ 9*16] 317 %define %%flat8mem [esp+10*16] 318 %else 319 %ifidn %2, v 320 %define %%p2mem [esp+16*16] 321 %define %%q2mem [esp+ 1*16] 322 %define %%q3mem [esp+18*16] 323 %define %%flat8mem [esp+ 0*16] 324 %define %%flat16mem [esp+20*16] 325 %else 326 %define %%p2mem [esp+27*16] 327 %define %%q2mem [esp+28*16] 328 %define %%q3mem [esp+29*16] 329 %define %%flat8mem [esp+21*16] 330 %define %%flat16mem [esp+30*16] 331 %endif 332 %endif 333 %xdefine m12reg m12 334%endif 335 336%if ARCH_X86_32 337 lea stride3q, [strideq*3] 338%endif 339 ; load data 340%ifidn %2, v 341%if ARCH_X86_32 342 mov mstrideq, strideq 343 neg mstrideq 344%endif 345%if %1 == 4 346 lea tmpq, [dstq+mstrideq*2] 347 mova m3, [tmpq+strideq*0] ; p1 348 mova m4, [tmpq+strideq*1] ; p0 349 mova m5, [tmpq+strideq*2] ; q0 350 mova m6, [tmpq+stride3q] ; q1 351%else 352 ; load 6-8 pixels, remainder (for wd=16) will be read inline 353 lea tmpq, [dstq+mstrideq*4] 354 ; we load p3 later 355%define %%p3mem [dstq+mstrideq*4] 356 %if ARCH_X86_32 357 %define m13 m0 358 %define m14 m1 359 %define m15 m2 360 %endif 361 mova m13, [tmpq+strideq*1] 362 mova m3, [tmpq+strideq*2] 363 mova m4, [tmpq+stride3q] 364 mova m5, [dstq+strideq*0] 365 mova m6, [dstq+strideq*1] 366 mova m14, [dstq+strideq*2] 367%if %1 != 6 368 mova m15, [dstq+stride3q] 369%endif 370 %if ARCH_X86_32 371 mova %%p2mem, m13 372 mova %%q2mem, m14 373 %define m13 %%p2mem 374 %define m14 %%q2mem 375 %if %1 != 6 376 mova %%q3mem, m15 377 %define m15 %%q3mem 378 %endif 379 %endif 380%endif 381%else ; %2 == h 382 ; load lines 383%if %1 == 4 384 ; transpose 4x16 385 movd m7, [dstq+strideq*0-2] 386 movd m3, [dstq+strideq*1-2] 387 movd m4, [dstq+strideq*2-2] 388 movd m5, [dstq+stride3q -2] 389 lea tmpq, [dstq+strideq*4] 390 punpcklbw m7, m3 391 punpcklbw m4, m5 392 movd m3, [tmpq+strideq*0-2] 393 movd m1, [tmpq+strideq*1-2] 
394 movd m5, [tmpq+strideq*2-2] 395 movd m6, [tmpq+stride3q -2] 396 lea tmpq, [tmpq+strideq*4] 397 punpcklbw m3, m1 398 punpcklbw m5, m6 399 movd m0, [tmpq+strideq*0-2] 400 movd m1, [tmpq+strideq*1-2] 401 punpcklbw m0, m1 402 movd m1, [tmpq+strideq*2-2] 403 movd m2, [tmpq+stride3q -2] 404 punpcklbw m1, m2 405 punpcklqdq m7, m0 406 punpcklqdq m4, m1 407 lea tmpq, [tmpq+strideq*4] 408 movd m0, [tmpq+strideq*0-2] 409 movd m1, [tmpq+strideq*1-2] 410 punpcklbw m0, m1 411 movd m1, [tmpq+strideq*2-2] 412 movd m2, [tmpq+stride3q -2] 413 punpcklbw m1, m2 414 punpcklqdq m3, m0 415 punpcklqdq m5, m1 416 ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 417 ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 418 ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 419 ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 420 punpcklwd m6, m7, m4 421 punpckhwd m7, m4 422 punpcklwd m4, m3, m5 423 punpckhwd m3, m5 424 ; xm6: A0-3,B0-3,C0-3,D0-3 425 ; xm7: A8-11,B8-11,C8-11,D8-11 426 ; xm4: A4-7,B4-7,C4-7,D4-7 427 ; xm3: A12-15,B12-15,C12-15,D12-15 428 punpckldq m5, m6, m4 429 punpckhdq m6, m4 430 punpckldq m4, m7, m3 431 punpckhdq m7, m3 432 ; xm5: A0-7,B0-7 433 ; xm6: C0-7,D0-7 434 ; xm4: A8-15,B8-15 435 ; xm7: C8-15,D8-15 436 punpcklqdq m3, m5, m4 437 punpckhqdq m5, m5, m4 438 punpcklqdq m4, m6, m7 439 punpckhqdq m6, m7 440 ; xm3: A0-15 441 ; xm5: B0-15 442 ; xm4: C0-15 443 ; xm6: D0-15 444 SWAP 4, 5 445%elif %1 == 6 || %1 == 8 446 ; transpose 8x16 447 movq m7, [dstq+strideq*0-%1/2] 448 movq m3, [dstq+strideq*1-%1/2] 449 movq m4, [dstq+strideq*2-%1/2] 450 movq m5, [dstq+stride3q -%1/2] 451 lea tmpq, [dstq+strideq*8] 452 punpcklbw m7, m3 453 punpcklbw m4, m5 454 movq m3, [tmpq+strideq*0-%1/2] 455 movq m1, [tmpq+strideq*1-%1/2] 456 movq m5, [tmpq+strideq*2-%1/2] 457 movq m6, [tmpq+stride3q -%1/2] 458 lea tmpq, [dstq+strideq*4] 459 punpcklbw m3, m1 460 punpcklbw m5, m6 461 movq m6, [tmpq+strideq*0-%1/2] 462 movq m0, [tmpq+strideq*1-%1/2] 463 movq m1, [tmpq+strideq*2-%1/2] 464 
movq m2, [tmpq+stride3q -%1/2] 465 lea tmpq, [tmpq+strideq*8] 466 punpcklbw m6, m0 467 punpcklbw m1, m2 468 movq m2, [tmpq+strideq*2-%1/2] 469 movq m0, [tmpq+stride3q -%1/2] 470 punpcklbw m2, m0 471%if ARCH_X86_64 472 SWAP m15, m2 473%else 474 %define m15 [esp+3*16] 475 mova m15, m2 476%endif 477 movq m0, [tmpq+strideq*0-%1/2] 478 movq m2, [tmpq+strideq*1-%1/2] 479 punpcklbw m0, m2 480 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 481 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 482 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 483 ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 484 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 485 ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 486 ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 487 ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 488 punpcklwd m2, m7, m4 489 punpckhwd m7, m4 490 punpcklwd m4, m3, m5 491 punpckhwd m3, m5 492 punpcklwd m5, m6, m1 493 punpckhwd m6, m1 494 punpcklwd m1, m0, m15 495 punpckhwd m0, m15 496%if ARCH_X86_64 497 SWAP m15, m0 498%else 499 mova m15, m0 500%endif 501 ; xm2: A0-3,B0-3,C0-3,D0-3 502 ; xm7: E0-3,F0-3,G0-3,H0-3 503 ; xm4: A8-11,B8-11,C8-11,D8-11 504 ; xm3: E8-11,F8-11,G8-11,H8-11 505 ; xm5: A4-7,B4-7,C4-7,D4-7 506 ; xm6: E4-7,F4-7,G4-7,H4-7 507 ; xm1: A12-15,B12-15,C12-15,D12-15 508 ; xm0: E12-15,F12-15,G12-15,H12-15 509 punpckldq m0, m2, m5 510 punpckhdq m2, m5 511 punpckldq m5, m7, m6 512%if %1 != 6 513 punpckhdq m7, m6 514%endif 515 punpckldq m6, m4, m1 516 punpckhdq m4, m1 517 punpckldq m1, m3, m15 518%if %1 != 6 519 punpckhdq m3, m15 520 %if ARCH_X86_64 521 SWAP m15, m3 522 %else 523 mova m15, m3 524 %endif 525%endif 526 ; xm0: A0-7,B0-7 527 ; xm2: C0-7,D0-7 528 ; xm5: E0-7,F0-7 529 ; xm7: G0-7,H0-7 530 ; xm6: A8-15,B8-15 531 ; xm4: C8-15,D8-15 532 ; xm1: E8-15,F8-15 533 ; xm3: G8-15,H8-15 534 punpcklqdq m3, m0, m6 535 punpckhqdq m0, m6 536 punpckhqdq m6, m2, m4 537 punpcklqdq m2, m4 538 punpcklqdq m4, m5, m1 539 punpckhqdq m5, 
m1 540%if %1 == 8 541 punpcklqdq m1, m7, m15 542 punpckhqdq m7, m15 543 ; xm3: A0-15 544 ; xm0: B0-15 545 ; xm2: C0-15 546 ; xm6: D0-15 547 ; xm4: E0-15 548 ; xm5: F0-15 549 ; xm1: G0-15 550 ; xm7: H0-15 551%if ARCH_X86_64 552 SWAP 11, 3, 2 553 SWAP 13, 0 554 SWAP 6, 5, 4 555 SWAP 14, 1 556 SWAP 15, 7 557 ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 558 mova [rsp+21*16], m11 559 %define %%p3mem [rsp+21*16] 560%else 561 %define m11 [esp+26*16] 562 %define m13 [esp+27*16] 563 %define m14 [esp+28*16] 564 %define m15 [esp+29*16] 565 mova m11, m3 566 mova m13, m0 567 SWAP 3, 2 568 SWAP 6, 5, 4 569 mova m14, m1 570 mova m15, m7 571 %define %%p3mem [esp+26*16] 572%endif 573%else 574 %if ARCH_X86_64 575 SWAP 13, 3, 0 576 SWAP 14, 5, 6, 4, 2 577 ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 578 %else 579 %define m13 %%p2mem 580 %define m14 %%q2mem 581 mova m13, m3 582 mova m14, m5 583 SWAP 3, 0 584 SWAP 5, 6, 4, 2 585 ; 0,2,6,4 -> 3,4,5,6 586 %endif 587%endif 588%else 589%if ARCH_X86_64 590 mova [rsp+20*16], m12 591%endif 592 ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the 593 ; remainder at the end for the second transpose 594%if ARCH_X86_32 595 %xdefine m8 m0 596 %xdefine m9 m1 597 %xdefine m10 m2 598 %xdefine m11 m3 599 %xdefine m12 m4 600 %xdefine m13 m5 601 %xdefine m14 m6 602 %xdefine m15 m7 603 lea tmpq, [dstq+strideq*8] 604 movu m8, [tmpq+strideq*0-8] 605 movu m9, [tmpq+strideq*1-8] 606 movu m10, [tmpq+strideq*2-8] 607 movu m11, [tmpq+stride3q -8] 608 lea tmpq, [tmpq+strideq*4] 609 movu m12, [tmpq+strideq*0-8] 610 movu m13, [tmpq+strideq*1-8] 611 movu m14, [tmpq+strideq*2-8] 612 movu m15, [tmpq+stride3q -8] 613 mova [esp+ 8*16], m8 614 mova [esp+ 9*16], m9 615 mova [esp+10*16], m10 616 mova [esp+11*16], m11 617 mova [esp+12*16], m12 618 mova [esp+13*16], m13 619 mova [esp+14*16], m14 620 mova [esp+15*16], m15 621%endif 622 movu m0, [dstq+strideq*0-8] 623 movu m1, [dstq+strideq*1-8] 624 movu m2, [dstq+strideq*2-8] 625 movu m3, [dstq+stride3q -8] 626 lea tmpq, [dstq+strideq*4] 627 movu m4, [tmpq+strideq*0-8] 628 movu m5, [tmpq+strideq*1-8] 629 movu m6, [tmpq+strideq*2-8] 630 movu m7, [tmpq+stride3q -8] 631 lea tmpq, [tmpq+strideq*4] 632%if ARCH_X86_64 633 movu m8, [tmpq+strideq*0-8] 634 movu m9, [tmpq+strideq*1-8] 635 movu m10, [tmpq+strideq*2-8] 636 movu m11, [tmpq+stride3q -8] 637 lea tmpq, [tmpq+strideq*4] 638 movu m12, [tmpq+strideq*0-8] 639 movu m13, [tmpq+strideq*1-8] 640 movu m14, [tmpq+strideq*2-8] 641 movu m15, [tmpq+stride3q -8] 642%endif 643 644%if ARCH_X86_64 645 TRANSPOSE_16X16B 0, [rsp+11*16] 646 mova [rsp+12*16], m1 647 mova [rsp+13*16], m2 648 mova [rsp+14*16], m3 649 mova [rsp+15*16], m12 650 mova [rsp+16*16], m13 651 mova [rsp+17*16], m14 652 mova [rsp+18*16], m15 653 ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 654 SWAP 12, 4, 7 655 SWAP 13, 5, 8 656 SWAP 3, 6, 9 657 SWAP 10, 14 658 SWAP 11, 15 659 mova [rsp+21*16], m12 660 %define %%p3mem [rsp+21*16] 661 mova m12, [rsp+20*16] 662%else 663 TRANSPOSE_16X16B 0, [esp+16*16] 664 %define %%p3mem [esp+26*16] 665 %define m11 
%%p3mem 666 %define m13 %%p2mem 667 %define m14 %%q2mem 668 %define m15 %%q3mem 669%endif 670%endif ; if 4 elif 6 or 8 else 16 671%endif ; if v else h 672 673 ; load L/E/I/H 674%if ARCH_X86_32 675 mov l_strideq, l_stridem 676%endif 677%ifidn %2, v 678 movu m1, [lq] 679 movu m0, [lq+l_strideq] 680%else 681 %if ARCH_X86_32 682 lea l_stride3q, [l_strideq*3] 683 %endif 684 movq xm1, [lq] 685 movq xm2, [lq+l_strideq*2] 686 movhps xm1, [lq+l_strideq] 687 movhps xm2, [lq+l_stride3q] 688 shufps m0, m1, m2, q3131 689 shufps m1, m2, q2020 690 %if ARCH_X86_32 691 lea stride3q, [strideq*3] 692 %endif 693%endif 694 695%if ARCH_X86_32 696 %ifidn %2, v 697 mov lutd, lutm 698 %endif 699%endif 700 pxor m2, m2 701 pcmpeqb m7, m2, m0 702 pand m1, m7 703 por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] 704 pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1] 705 pcmpeqb m2, m0 ; !L 706 psrlq m7, m0, [lutq+128] 707 pand m7, [PIC_sym(pb_63)] 708 pminub m7, minlvl 709 pmaxub m7, [PIC_sym(pb_1)] ; I 710 pand m1, m0, [PIC_sym(pb_240)] 711 psrlq m1, 4 ; H 712 paddb m0, [PIC_sym(pb_2)] 713 paddb m0, m0 714 paddb m0, m7 ; E 715 pxor m1, [PIC_sym(pb_128)] 716 pxor m7, [PIC_sym(pb_128)] 717 pxor m0, [PIC_sym(pb_128)] 718 SWAP 2, 7 719 720%if ARCH_X86_64 721 SWAP 0, 8 722 SWAP 2, 10 723%else 724 %ifidn %2, v 725 mov mstrideq, strideq 726 neg mstrideq 727 %if %1 == 4 728 lea tmpq, [dstq+mstrideq*2] 729 %elif %1 == 6 || %1 == 8 730 lea tmpq, [dstq+mstrideq*4] 731 %endif 732 %endif 733 mova [esp+3*16], m0 734 mova [esp+4*16], m2 735%endif 736 737 ABSSUB m0, m3, m4, m2 ; abs(p1-p0) 738 pmaxub m0, m7 739 ABSSUB m2, m5, m6, m7 ; abs(q1-q0) 740 pmaxub m0, m2 741%if %1 == 4 742 pxor m0, [PIC_sym(pb_128)] 743 pcmpgtb m7, m0, m1 ; hev 744 %if ARCH_X86_64 745 SWAP 7, 11 746 %else 747 mova [esp+5*16], m7 748 %endif 749%else 750 pxor m7, m0, [PIC_sym(pb_128)] 751 pcmpgtb m7, m1 ; hev 752%if ARCH_X86_64 753 SWAP 7, 11 754%else 755 mova [esp+5*16], m7 756%endif 757 758%if %1 == 6 759 ABSSUB m1, m13, m4, m7 ; 
abs(p2-p0) 760 pmaxub m1, m0 761%else 762 mova m2, %%p3mem 763 ABSSUB m1, m2, m4, m7 ; abs(p3-p0) 764 pmaxub m1, m0 765 ABSSUB m7, m13, m4, m2 ; abs(p2-p0) 766 pmaxub m1, m7 767%endif 768 ABSSUB m7, m5, m14, m2 ; abs(p2-p0) 769 pmaxub m1, m7 770%if %1 != 6 771 ABSSUB m7, m5, m15, m2 ; abs(q3-q0) 772 pmaxub m1, m7 773%endif 774 pxor m1, [PIC_sym(pb_128)] 775 pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in 776%if ARCH_X86_64 777 SWAP 1, 9 778%else 779 mova [esp+6*16], m1 780%endif 781 782%if %1 == 6 783 ABSSUB m7, m13, m3, m1 ; abs(p2-p1) 784%else 785 mova m2, %%p3mem 786 ABSSUB m7, m2, m13, m1 ; abs(p3-p2) 787 ABSSUB m2, m13, m3, m1 ; abs(p2-p1) 788 pmaxub m7, m2 789 ABSSUB m2, m14, m15, m1 ; abs(q3-q2) 790 pmaxub m7, m2 791%endif 792 ABSSUB m2, m14, m6, m1 ; abs(q2-q1) 793 pmaxub m7, m2 794%if ARCH_X86_32 795 %define m12 m1 796 mova m12, maskmem 797%endif 798 pand m2, m12, mask1 799 pcmpeqd m2, m12 800 pand m7, m2 ; only apply fm-wide to wd>4 blocks 801 pmaxub m0, m7 802 803 pxor m0, [PIC_sym(pb_128)] 804%endif ; %if %1 == 4 else 805%if ARCH_X86_64 806 SWAP 2, 10 807 pcmpgtb m0, m2 808%else 809 pcmpgtb m0, [esp+4*16] 810%endif 811 812 ABSSUB m1, m3, m6, m7 ; abs(p1-q1) 813 ABSSUB m7, m4, m5, m2 ; abs(p0-q0) 814 paddusb m7, m7 815 pand m1, [PIC_sym(pb_254)] 816 psrlq m1, 1 817 paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1) 818 pxor m1, [PIC_sym(pb_128)] 819%if ARCH_X86_64 820 pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E 821%else 822 pcmpgtb m1, [esp+3*16] 823%endif 824 por m0, m1 825 826%if %1 == 16 827%if ARCH_X86_64 828 SWAP 0, 8 829%else 830 mova [esp+3*16], m0 831%endif 832%ifidn %2, v 833 lea tmpq, [dstq+mstrideq*8] 834 mova m0, [tmpq+strideq*1] 835%else 836 mova m0, [rsp+12*16] 837%endif 838 ABSSUB m1, m0, m4, m2 839%ifidn %2, v 840 mova m0, [tmpq+strideq*2] 841%else 842 mova m0, [rsp+13*16] 843%endif 844 ABSSUB m2, m0, m4, m7 845 pmaxub m1, m2 846%ifidn %2, v 847 mova m0, [tmpq+stride3q] 848%else 849 mova m0, [rsp+14*16] 850%endif 851 ABSSUB m2, m0, m4, m7 852 
pmaxub m1, m2 853%ifidn %2, v 854 lea tmpq, [dstq+strideq*4] 855 mova m0, [tmpq+strideq*0] 856%else 857 mova m0, [rsp+15*16] 858%endif 859 ABSSUB m2, m0, m5, m7 860 pmaxub m1, m2 861%ifidn %2, v 862 mova m0, [tmpq+strideq*1] 863%else 864 mova m0, [rsp+16*16] 865%endif 866 ABSSUB m2, m0, m5, m7 867 pmaxub m1, m2 868%ifidn %2, v 869 mova m0, [tmpq+strideq*2] 870%else 871 mova m0, [rsp+17*16] 872%endif 873 ABSSUB m2, m0, m5, m7 874 pmaxub m1, m2 875 pxor m1, [PIC_sym(pb_128)] 876 pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out 877%if ARCH_X86_64 878 por m1, m9 ; !flat8in | !flat8out 879%else 880 por m1, [esp+6*16] 881 %define m12 m7 882 mova m12, maskmem 883%endif 884 pand m2, m12, mask2 885 pcmpeqd m2, m12 886 pandn m1, m2 ; flat16 887%if ARCH_X86_64 888 pandn m2, m8, m1 ; flat16 & fm 889%else 890 pandn m2, [esp+3*16], m1 ; flat16 & fm 891 mova %%flat16mem, m2 892%endif 893 SWAP 1, 2 894 895 pand m2, m12, mask1 896 pcmpeqd m2, m12 897%if ARCH_X86_64 898 pandn m9, m2 ; flat8in 899 pandn m2, m8, m9 900 SWAP 2, 9 901%else 902 pandn m0, [esp+6*16], m2 903 pandn m2, [esp+3*16], m0 904 mova [esp+6*16], m2 905%endif 906 pand m2, m12, mask0 907 pcmpeqd m2, m12 908%if ARCH_X86_64 909 pandn m8, m2 910 pandn m2, m9, m8 ; fm & !flat8 & !flat16 911 SWAP 2, 8 912 pandn m2, m1, m9 ; flat8 & !flat16 913 SWAP 2, 9 914 SWAP 0, 8 915 SWAP 1, 10 916%else 917 pandn m0, [esp+3*16], m2 918 pandn m2, [esp+6*16], m0 919 SWAP 2, 0 920 pandn m2, m1, [esp+6*16] 921 mova %%flat8mem, m2 922%endif 923%elif %1 != 4 924 %if ARCH_X86_64 925 SWAP 1, 9 926 %else 927 %define m12 m7 928 mova m12, maskmem 929 mova m1, [esp+6*16] 930 %endif 931 pand m2, m12, mask1 932 pcmpeqd m2, m12 933 pandn m1, m2 934 pandn m2, m0, m1 ; flat8 & fm 935 pand m1, m12, mask0 936 pcmpeqd m1, m12 937 pandn m0, m1 938 pandn m1, m2, m0 ; fm & !flat8 939 SWAP 1, 2, 0 940 %if ARCH_X86_64 941 SWAP 1, 9 942 %else 943 mova %%flat8mem, m1 944 %endif 945%else 946%if ARCH_X86_32 947 %define m12 m1 948 mova m12, maskmem 949%endif 950 pand m2, 
m12, mask0 951 pcmpeqd m2, m12 952 pandn m0, m2 ; fm 953%endif 954 955 ; short filter 956 957 mova m1, [PIC_sym(pb_128)] 958%if ARCH_X86_64 959 SWAP 7, 11 960%else 961 mova m7, [esp+5*16] 962%endif 963 pxor m3, m1 964 pxor m6, m1 965 pxor m4, m1 966 pxor m5, m1 967 psubsb m1, m3, m6 ; iclip_diff(p1-q1) 968 pand m1, m7 ; f=iclip_diff(p1-q1)&hev 969 psubsb m2, m5, m4 970 paddsb m1, m2 971 paddsb m1, m2 972 paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) 973 mova m2, [PIC_sym(pb_16)] 974 pand m0, m1 ; f&=fm 975 paddsb m1, m0, [PIC_sym(pb_3)] 976 paddsb m0, [PIC_sym(pb_4)] 977 pand m1, [PIC_sym(pb_248)] 978 pand m0, [PIC_sym(pb_248)] 979 psrlq m1, 3 980 psrlq m0, 3 981 pxor m1, m2 982 pxor m0, m2 983 psubb m1, m2 ; f2 984 psubb m0, m2 ; f1 985 mova m2, [PIC_sym(pb_128)] 986 paddsb m4, m1 987 psubsb m5, m0 988 pxor m4, m2 989 pxor m5, m2 990 991 pxor m0, m2 992 pxor m1, m1 993 pavgb m0, m1 ; f=(f1+1)>>1 994 psubb m0, [PIC_sym(pb_64)] 995 pandn m7, m0 ; f&=!hev 996 paddsb m3, m7 997 psubsb m6, m7 998 pxor m3, m2 999 pxor m6, m2 1000 1001%if %1 == 16 1002 ; flat16 filter 1003%ifidn %2, v 1004 lea tmpq, [dstq+mstrideq*8] 1005 mova m0, [tmpq+strideq*1] ; p6 1006 mova m2, [tmpq+strideq*2] ; p5 1007 mova m7, [tmpq+stride3q] ; p4 1008%else 1009 mova m0, [rsp+12*16] 1010 mova m2, [rsp+13*16] 1011 mova m7, [rsp+14*16] 1012%endif 1013 1014%if ARCH_X86_64 1015 SWAP 1, 10 1016 mova %%flat8mem, m9 1017 mova %%q2mem, m14 1018 mova %%q3mem, m15 1019 SWAP 0, 8 1020 SWAP 1, 9 1021%else 1022 %ifidn %2, v 1023 mova [esp+17*16], m0 1024 mova [esp+19*16], m3 1025 mova [esp+21*16], m4 1026 mova [esp+22*16], m5 1027 mova [esp+23*16], m6 1028 %xdefine m11 m3 1029 %xdefine m14 m4 1030 %xdefine m15 m5 1031 %xdefine m10 m6 1032 %define m13 %%p2mem 1033 %define m8 [esp+17*16] 1034 %define m9 %%flat16mem 1035 %define m3 [esp+19*16] 1036 %define m4 [esp+21*16] 1037 %define m5 [esp+22*16] 1038 %define m6 [esp+23*16] 1039 %else 1040 mova [esp+31*16], m0 1041 mova [esp+32*16], m3 1042 mova [esp+33*16], m4 
1043 mova [esp+34*16], m5 1044 mova [esp+35*16], m6 1045 %xdefine m11 m3 1046 %xdefine m14 m4 1047 %xdefine m15 m5 1048 %xdefine m10 m6 1049 %define m13 %%p2mem 1050 %define m8 [esp+31*16] 1051 %define m9 %%flat16mem 1052 %define m3 [esp+32*16] 1053 %define m4 [esp+33*16] 1054 %define m5 [esp+34*16] 1055 %define m6 [esp+35*16] 1056 %endif 1057%endif 1058 1059 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A 1060 ; write -6 1061 mova m11, %%p3mem 1062%if ARCH_X86_64 1063 punpcklbw m14, m8, m11 1064 punpckhbw m15, m8, m11 1065%else 1066 punpcklbw m14, m0, m11 1067 punpckhbw m15, m0, m11 1068%endif 1069%ifidn %2, v 1070 mova [rsp+5*16], m11 1071%endif 1072 pmaddubsw m10, m14, [PIC_sym(pb_7_1)] 1073 pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 1074 punpcklbw m0, m2, m7 1075 punpckhbw m1, m2, m7 1076 pmaddubsw m0, [PIC_sym(pb_2)] 1077 pmaddubsw m1, [PIC_sym(pb_2)] 1078 paddw m10, m0 1079 paddw m11, m1 ; p6*7+p5*2+p4*2+p3 1080 punpcklbw m0, m13, m3 1081 punpckhbw m1, m13, m3 1082 pmaddubsw m0, [PIC_sym(pb_1)] 1083 pmaddubsw m1, [PIC_sym(pb_1)] 1084 paddw m10, m0 1085 paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 1086 punpcklbw m0, m4, m5 1087 punpckhbw m1, m4, m5 1088 pmaddubsw m0, [PIC_sym(pb_1)] 1089 pmaddubsw m1, [PIC_sym(pb_1)] 1090 paddw m10, m0 1091 paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 1092 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1093 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1094 packuswb m0, m1 1095 pand m0, m9 1096 pandn m1, m9, m2 1097 por m0, m1 1098%ifidn %2, v 1099 mova [tmpq+strideq*2], m0 ; p5 1100%else 1101 mova [rsp+13*16], m0 1102%endif 1103 1104 ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B 1105 ; write -5 1106 pmaddubsw m14, [PIC_sym(pb_m1_1)] 1107 pmaddubsw m15, [PIC_sym(pb_m1_1)] 1108 paddw m10, m14 1109 paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 1110 punpcklbw m0, m8, m6 1111 punpckhbw m1, m8, m6 1112 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1113 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1114 mova [rsp+3*16], m0 1115 mova [rsp+4*16], m1 
1116 paddw m10, m0 1117 paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 1118 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1119 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1120 packuswb m0, m1 1121 pand m0, m9 1122 pandn m1, m9, m7 1123 por m0, m1 1124%ifidn %2, v 1125 mova [tmpq+stride3q], m0 ; p4 1126%else 1127 mova [rsp+14*16], m0 1128%endif 1129 1130 ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C 1131 ; write -4 1132 mova m14, %%q2mem 1133 punpcklbw m0, m8, m13 1134 punpckhbw m1, m8, m13 1135 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1136 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1137 paddw m10, m0 1138 paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 1139 punpcklbw m0, m2, m14 1140 punpckhbw m2, m14 1141 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1142 pmaddubsw m2, [PIC_sym(pb_m1_1)] 1143 mova [rsp+1*16], m0 1144 paddw m10, m0 1145 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 1146 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1147 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1148 packuswb m0, m1 1149 pand m0, m9 1150 pandn m1, m9, %%p3mem 1151 por m0, m1 1152%ifidn %2, v 1153 mova [tmpq+strideq*4], m0 ; p3 1154%else 1155 mova [rsp+19*16], m0 1156%endif 1157 1158 ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D 1159 ; write -3 1160 mova m15, %%q3mem 1161 punpcklbw m0, m8, m3 1162 punpckhbw m1, m8, m3 1163 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1164 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1165 paddw m10, m0 1166 paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 1167 punpcklbw m0, m7, m15 1168 punpckhbw m7, m15 1169 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1170 pmaddubsw m7, [PIC_sym(pb_m1_1)] 1171 mova [rsp+2*16], m0 1172%if ARCH_X86_32 1173 %ifidn %2, v 1174 mova [esp+24*16], m7 1175 %else 1176 mova [esp+36*16], m7 1177 %endif 1178%endif 1179 paddw m10, m0 1180 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 1181 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1182 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1183 packuswb m0, m1 1184 pand m0, m9 1185 pandn m1, m9, m13 1186 por m0, m1 1187 mova [rsp+6*16], m0 ; don't clobber 
p2/m13 since we need it in F 1188 1189 ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E 1190 ; write -2 1191 punpcklbw m0, m8, m4 1192 punpckhbw m1, m8, m4 1193 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1194 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1195 paddw m10, m0 1196 paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 1197%if ARCH_X86_64 1198 SWAP 7, 8 1199%endif 1200%ifidn %2, v 1201 mova m1, [dstq+strideq*4] ; q4 1202 mova m7, [rsp+5*16] ; (pre-filter) p3 1203%else 1204 mova m1, [rsp+15*16] 1205 mova m7, %%p3mem ; (pre-filter) p3 1206%endif 1207 punpcklbw m0, m1, m7 1208 punpckhbw m1, m1, m7 1209 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1210 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1211 mova [rsp+7*16], m0 1212 mova [rsp+5*16], m1 1213 psubw m10, m0 1214 psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 1215 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1216 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1217 packuswb m0, m1 1218 pand m0, m9 1219 pandn m1, m9, m3 1220 por m0, m1 1221 mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G 1222 1223 ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F 1224 ; write -1 1225%ifidn %2, v 1226 mova m7, [tmpq+strideq*1] ; p6 1227 lea tmpq, [dstq+strideq*4] 1228 mova m1, [tmpq+strideq*1] ; q5 1229%else 1230 mova m7, [rsp+12*16] ; p6 1231 mova m1, [rsp+16*16] 1232%endif 1233 punpcklbw m0, m7, m5 1234 punpckhbw m7, m5 1235 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1236 pmaddubsw m7, [PIC_sym(pb_m1_1)] 1237 paddw m10, m0 1238 paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 1239 punpcklbw m7, m13, m1 1240 pmaddubsw m7, [PIC_sym(pb_m1_1)] 1241 mova [rsp+9*16], m7 1242 paddw m10, m7 1243%if ARCH_X86_64 1244 punpckhbw m13, m1 1245 mova m1, [rsp+6*16] 1246 SWAP 1, 13 1247%else 1248 punpckhbw m7, m13, m1 1249 mova m1, [esp+6*16] 1250 mova m13, m1 1251 SWAP 1, 7 1252%endif 1253 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1254 mova [rsp+10*16], m1 1255 paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 1256 pmulhrsw m7, m10, [PIC_sym(pw_2048)] 1257 
pmulhrsw m0, m11, [PIC_sym(pw_2048)] 1258 packuswb m7, m0 1259 pand m7, m9 1260 pandn m0, m9, m4 1261 por m7, m0 1262 mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H 1263 1264 ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G 1265 ; write +0 1266%ifidn %2, v 1267 mova m7, [tmpq+strideq*2] ; q6 1268%else 1269 mova m7, [rsp+17*16] 1270%endif 1271 paddw m10, [rsp+3*16] 1272 paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 1273 punpcklbw m0, m3, m7 1274 punpckhbw m1, m3, m7 1275%if ARCH_X86_64 1276 mova m3, [rsp+8*16] 1277%endif 1278 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1279 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1280 mova [rsp+3*16], m0 1281 mova [rsp+4*16], m1 1282 paddw m10, m0 1283 paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 1284 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1285 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1286 packuswb m0, m1 1287 pand m0, m9 1288 pandn m1, m9, m5 1289 por m0, m1 1290%if ARCH_X86_32 1291 mova m1, [esp+8*16] 1292 mova m3, m1 1293%endif 1294 mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I 1295 1296 ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H 1297 ; write +1 1298 paddw m10, [rsp+1*16] 1299 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 1300 punpcklbw m0, m4, m7 1301 punpckhbw m2, m4, m7 1302 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1303 pmaddubsw m2, [PIC_sym(pb_m1_1)] 1304 paddw m10, m0 1305 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 1306%if ARCH_X86_64 1307 mova m4, [rsp+6*16] 1308%else 1309 %define m4 [esp+6*16] 1310%endif 1311 pmulhrsw m2, m10, [PIC_sym(pw_2048)] 1312 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1313 packuswb m2, m1 1314 pand m2, m9 1315 pandn m1, m9, m6 1316 por m2, m1 ; don't clobber q1/m6 since we need it in K 1317 1318 ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I 1319 ; write +2 1320 paddw m10, [rsp+2*16] 1321%if ARCH_X86_64 1322 SWAP 7, 8 1323 paddw m11, m7 1324%else 1325 mova m8, m7 1326 %ifidn %2, v 1327 paddw m11, 
[esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 1328 %else 1329 paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 1330 %endif 1331%endif 1332 punpcklbw m0, m5, m8 1333 punpckhbw m1, m5, m8 1334%if ARCH_X86_64 1335 mova m5, [rsp+8*16] 1336%else 1337 %define m5 [esp+8*16] 1338%endif 1339 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1340 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1341 paddw m10, m0 1342 paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 1343 pmulhrsw m7, m10, [PIC_sym(pw_2048)] 1344 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1345 packuswb m7, m1 1346 pand m7, m9 1347 pandn m1, m9, m14 1348 por m7, m1 ; don't clobber q2/m14 since we need it in K 1349 1350 ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J 1351 ; write +3 1352 psubw m10, [rsp+7*16] 1353 psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 1354 punpcklbw m0, m6, m8 1355 punpckhbw m1, m6, m8 1356 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1357 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1358 paddw m10, m0 1359 paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 1360 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1361 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1362 packuswb m0, m1 1363 pand m0, m9 1364 pandn m1, m9, m15 1365 por m0, m1 1366%ifidn %2, v 1367 mova [tmpq+mstrideq], m0 ; q3 1368%else 1369 mova [rsp+20*16], m0 1370%endif 1371 1372 ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K 1373 ; write +4 1374 paddw m10, [rsp+ 9*16] 1375 paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 1376 punpcklbw m0, m14, m8 1377 punpckhbw m1, m14, m8 1378 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1379 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1380 paddw m10, m0 1381 paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 1382 pmulhrsw m0, m10, [PIC_sym(pw_2048)] 1383 pmulhrsw m1, m11, [PIC_sym(pw_2048)] 1384 packuswb m0, m1 1385 pand m0, m9 1386%ifidn %2, v 1387 pandn m1, m9, [tmpq+strideq*0] 1388%else 1389 pandn m1, m9, [rsp+15*16] 1390%endif 1391 por m0, m1 1392%ifidn %2, v 1393 mova [tmpq+strideq*0], m0 ; q4 
1394%else 1395 mova [rsp+15*16], m0 1396%endif 1397 1398 ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L 1399 ; write +5 1400 paddw m10, [rsp+3*16] 1401 paddw m11, [rsp+4*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 1402 punpcklbw m0, m15, m8 1403 punpckhbw m1, m15, m8 1404 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1405 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1406 paddw m10, m0 1407 paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 1408 pmulhrsw m10, [PIC_sym(pw_2048)] 1409 pmulhrsw m11, [PIC_sym(pw_2048)] 1410 packuswb m10, m11 1411 pand m10, m9 1412%ifidn %2, v 1413 pandn m11, m9, [tmpq+strideq*1] 1414%else 1415 pandn m11, m9, [rsp+16*16] 1416%endif 1417 por m10, m11 1418%ifidn %2, v 1419 mova [tmpq+strideq*1], m10 ; q5 1420%else 1421 mova [rsp+16*16], m10 1422%endif 1423 1424%if ARCH_X86_64 1425 SWAP 0, 8 1426 SWAP 1, 9 1427 SWAP 14, 7 1428%else 1429 %xdefine m3 m11 1430 %xdefine m4 m14 1431 %xdefine m5 m15 1432 %xdefine m6 m10 1433 mova %%q2mem, m7 1434 %ifidn %2, v 1435 mova m3, [esp+19*16] 1436 %else 1437 mova m3, [esp+32*16] 1438 %endif 1439 mova m4, [esp+ 6*16] 1440 mova m5, [esp+ 8*16] 1441%endif 1442 SWAP m6, m2 1443 1444%if ARCH_X86_64 1445 mova m9, %%flat8mem 1446%endif 1447%ifidn %2, v 1448 lea tmpq, [dstq+mstrideq*4] 1449%endif 1450%endif ; if %1 == 16 1451%if %1 >= 8 1452 ; flat8 filter 1453%if ARCH_X86_32 1454 %define m9 %%flat8mem 1455 %define m11 m1 1456 %define m13 %%p2mem 1457 %define m14 %%q2mem 1458 %define m15 %%q3mem 1459%endif 1460 mova m11, %%p3mem 1461 punpcklbw m0, m11, m3 1462 punpcklbw m7, m13, m4 1463 pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 1464 pmaddubsw m7, [PIC_sym(pb_2_1)] 1465 paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 1466 punpcklbw m7, m5, [PIC_sym(pb_4)] 1467 pmaddubsw m7, [PIC_sym(pb_1)] 1468 paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 1469 punpckhbw m1, m11, m3 1470 pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 1471 punpckhbw m0, m13, m4 1472 pmaddubsw m0, [PIC_sym(pb_2_1)] 1473 paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + 
p0 1474 punpckhbw m0, m5, [PIC_sym(pb_4)] 1475 pmaddubsw m0, [PIC_sym(pb_1)] 1476 paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 1477 psrlw m0, m2, 3 1478 psrlw m1, m7, 3 1479 packuswb m0, m1 1480 pand m0, m9 1481 pandn m1, m9, m13 1482 por m0, m1 ; p2 1483%ifidn %2, v 1484 mova [tmpq+strideq*1], m0 1485%else 1486 %if ARCH_X86_64 1487 SWAP 0, 10 1488 %else 1489 mova [esp+2*16], m0 1490 %endif 1491%endif 1492 1493%if ARCH_X86_32 1494 mova m11, %%p3mem 1495%endif 1496 punpcklbw m0, m11, m3 1497 punpckhbw m1, m11, m3 1498 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1499 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1500 paddw m2, m0 1501 paddw m7, m1 1502 punpcklbw m0, m13, m6 1503 punpckhbw m1, m13, m6 1504 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1505 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1506 paddw m2, m0 1507 paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 1508 psrlw m0, m2, 3 1509 psrlw m1, m7, 3 1510 packuswb m0, m1 1511 pand m0, m9 1512 pandn m1, m9, m3 1513 por m0, m1 ; p1 1514%ifidn %2, v 1515 mova [tmpq+strideq*2], m0 1516%else 1517 mova [rsp+0*16], m0 1518%endif 1519 1520%if ARCH_X86_32 1521 mova m11, %%p3mem 1522%endif 1523 punpcklbw m0, m11, m3 1524 punpckhbw m1, m11, m3 1525 pmaddubsw m0, [PIC_sym(pb_1)] 1526 pmaddubsw m1, [PIC_sym(pb_1)] 1527 psubw m2, m0 1528 psubw m7, m1 1529 punpcklbw m0, m4, m14 1530 punpckhbw m1, m4, m14 1531 pmaddubsw m0, [PIC_sym(pb_1)] 1532 pmaddubsw m1, [PIC_sym(pb_1)] 1533 paddw m2, m0 1534 paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 1535 psrlw m0, m2, 3 1536 psrlw m1, m7, 3 1537 packuswb m0, m1 1538 pand m0, m9 1539 pandn m1, m9, m4 1540 por m0, m1 ; p0 1541%ifidn %2, v 1542 mova [tmpq+stride3q], m0 1543%else 1544 mova [rsp+1*16], m0 1545%endif 1546 1547 punpcklbw m0, m5, m15 1548 punpckhbw m1, m5, m15 1549 pmaddubsw m0, [PIC_sym(pb_1)] 1550 pmaddubsw m1, [PIC_sym(pb_1)] 1551 paddw m2, m0 1552 paddw m7, m1 1553%if ARCH_X86_32 1554 mova m11, %%p3mem 1555%endif 1556 punpcklbw m0, m11, m4 1557 punpckhbw m11, m11, m4 1558 pmaddubsw m0, 
[PIC_sym(pb_1)] 1559 pmaddubsw m11, [PIC_sym(pb_1)] 1560 psubw m2, m0 1561 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 1562 psrlw m0, m2, 3 1563 psrlw m11, m7, 3 1564 packuswb m0, m11 1565 pand m0, m9 1566 pandn m11, m9, m5 1567 por m11, m0 ; q0 1568%ifidn %2, v 1569 mova [dstq+strideq*0], m11 1570%elif ARCH_X86_32 1571 mova [esp+8*16], m11 1572%endif 1573 1574 punpcklbw m0, m5, m15 1575 punpckhbw m1, m5, m15 1576 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1577 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1578 paddw m2, m0 1579 paddw m7, m1 1580 punpcklbw m0, m13, m6 1581 punpckhbw m1, m13, m6 1582 pmaddubsw m0, [PIC_sym(pb_m1_1)] 1583 pmaddubsw m1, [PIC_sym(pb_m1_1)] 1584 paddw m2, m0 1585 paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 1586 psrlw m0, m2, 3 1587 psrlw m1, m7, 3 1588 packuswb m0, m1 1589 pand m0, m9 1590 pandn m1, m9, m6 1591 por m0, m1 ; q1 1592%ifidn %2, v 1593 mova [dstq+strideq*1], m0 1594%else 1595 %if ARCH_X86_64 1596 SWAP 0, 13 1597 %else 1598 mova [esp+9*16], m0 1599 %endif 1600%endif 1601 1602 punpcklbw m0, m3, m6 1603 punpckhbw m1, m3, m6 1604 pmaddubsw m0, [PIC_sym(pb_1)] 1605 pmaddubsw m1, [PIC_sym(pb_1)] 1606 psubw m2, m0 1607 psubw m7, m1 1608 punpcklbw m0, m14, m15 1609 punpckhbw m1, m14, m15 1610 pmaddubsw m0, [PIC_sym(pb_1)] 1611 pmaddubsw m1, [PIC_sym(pb_1)] 1612 paddw m2, m0 1613 paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 1614 psrlw m2, 3 1615 psrlw m7, 3 1616 packuswb m2, m7 1617 pand m2, m9 1618 pandn m7, m9, m14 1619 por m2, m7 ; q2 1620%ifidn %2, v 1621 mova [dstq+strideq*2], m2 1622%else 1623 mova m0, [rsp+0*16] 1624%if %1 == 8 1625 mova m1, [rsp+1*16] 1626 mova m4, %%p3mem 1627 1628%if ARCH_X86_32 1629 %define m10 [esp+2*16] 1630 %define m11 [esp+8*16] 1631 %define m13 [esp+9*16] 1632%endif 1633 1634 ; 16x8 transpose 1635 punpcklbw m3, m4, m10 1636 punpckhbw m4, m10 1637 punpcklbw m5, m0, m1 1638 punpckhbw m0, m1 1639 punpcklbw m1, m11, m13 1640 punpckhbw m6, m11, m13 1641 punpcklbw m7, m2, m15 1642 punpckhbw m2, 
m15 1643%if ARCH_X86_64 1644 SWAP 2, 15 1645%else 1646 mova m15, m2 1647%endif 1648 1649 punpcklwd m2, m3, m5 1650 punpckhwd m3, m5 1651 punpcklwd m5, m4, m0 1652 punpckhwd m4, m0 1653 punpcklwd m0, m1, m7 1654 punpckhwd m1, m7 1655 punpcklwd m7, m6, m15 1656 punpckhwd m6, m15 1657%if ARCH_X86_64 1658 SWAP 6, 15 1659%else 1660 mova m15, m6 1661%endif 1662 1663 punpckldq m6, m2, m0 1664 punpckhdq m2, m0 1665 punpckldq m0, m3, m1 1666 punpckhdq m3, m1 1667 punpckldq m1, m5, m7 1668 punpckhdq m5, m7 1669 punpckldq m7, m4, m15 1670 punpckhdq m4, m15 1671 1672 ; write 8x16 1673 movq [dstq+strideq*0-4], xm6 1674 movhps [dstq+strideq*1-4], xm6 1675 movq [dstq+strideq*2-4], xm2 1676 movhps [dstq+stride3q -4], xm2 1677 lea dstq, [dstq+strideq*4] 1678 movq [dstq+strideq*0-4], xm0 1679 movhps [dstq+strideq*1-4], xm0 1680 movq [dstq+strideq*2-4], xm3 1681 movhps [dstq+stride3q -4], xm3 1682 lea dstq, [dstq+strideq*4] 1683 movq [dstq+strideq*0-4], xm1 1684 movhps [dstq+strideq*1-4], xm1 1685 movq [dstq+strideq*2-4], xm5 1686 movhps [dstq+stride3q -4], xm5 1687 lea dstq, [dstq+strideq*4] 1688 movq [dstq+strideq*0-4], xm7 1689 movhps [dstq+strideq*1-4], xm7 1690 movq [dstq+strideq*2-4], xm4 1691 movhps [dstq+stride3q -4], xm4 1692 lea dstq, [dstq+strideq*4] 1693%else 1694 ; 16x16 transpose and store 1695 SWAP 6, 0 1696 SWAP 7, 1 1697 %if ARCH_X86_64 1698 SWAP 5, 10, 2 1699 SWAP 8, 11 1700 SWAP 9, 13 1701 mova [rsp+21*16], m12 1702 %else 1703 mova [esp+10*16], m2 1704 %xdefine m8 m0 1705 %xdefine m9 m1 1706 %xdefine m10 m2 1707 %xdefine m11 m3 1708 %xdefine m12 m4 1709 %xdefine m13 m5 1710 %xdefine m14 m6 1711 %xdefine m15 m7 1712 %endif 1713 mova m0, [rsp+11*16] 1714 mova m1, [rsp+12*16] 1715 mova m2, [rsp+13*16] 1716 mova m3, [rsp+14*16] 1717 mova m4, [rsp+19*16] 1718%if ARCH_X86_64 1719 mova m7, [rsp+ 1*16] 1720 mova m11, [rsp+20*16] 1721 mova m12, [rsp+15*16] 1722 mova m13, [rsp+16*16] 1723 mova m14, [rsp+17*16] 1724 TRANSPOSE_16X16B 1, [rsp+18*16] 1725%else 1726 mova m5, 
[esp+ 2*16] 1727 TRANSPOSE_16X16B 1, [esp+32*16] 1728 mov tmpq, dstq 1729 lea dstq, [dstq+strideq*8] 1730%endif 1731 movu [dstq+strideq*0-8], xm0 1732 movu [dstq+strideq*1-8], xm1 1733 movu [dstq+strideq*2-8], xm2 1734 movu [dstq+stride3q -8], xm3 1735 lea dstq, [dstq+strideq*4] 1736 movu [dstq+strideq*0-8], xm4 1737 movu [dstq+strideq*1-8], xm5 1738 movu [dstq+strideq*2-8], xm6 1739 movu [dstq+stride3q -8], xm7 1740%if ARCH_X86_64 1741 lea dstq, [dstq+strideq*4] 1742%else 1743 %xdefine m8 m0 1744 %xdefine m9 m1 1745 %xdefine m10 m2 1746 %xdefine m11 m3 1747 %xdefine m12 m4 1748 %xdefine m13 m5 1749 %xdefine m14 m6 1750 %xdefine m15 m7 1751 mova m8, [esp+11*16] 1752 mova m9, [esp+12*16] 1753 mova m10, [esp+13*16] 1754 mova m11, [esp+14*16] 1755 mova m12, [esp+26*16] 1756 mova m13, [esp+27*16] 1757 mova m14, [esp+ 0*16] 1758 mova m15, [esp+ 1*16] 1759 mov dstq, tmpq 1760%endif 1761 movu [dstq+strideq*0-8], xm8 1762 movu [dstq+strideq*1-8], xm9 1763 movu [dstq+strideq*2-8], xm10 1764 movu [dstq+stride3q -8], xm11 1765 lea dstq, [dstq+strideq*4] 1766 movu [dstq+strideq*0-8], xm12 1767 movu [dstq+strideq*1-8], xm13 1768 movu [dstq+strideq*2-8], xm14 1769 movu [dstq+stride3q -8], xm15 1770 lea dstq, [dstq+strideq*4] 1771%if ARCH_X86_32 1772 lea dstq, [dstq+strideq*8] 1773%else 1774 mova m12, [rsp+21*16] 1775%endif 1776 1777%endif ; if %1 == 8 1778%endif ; ifidn %2, v 1779%elif %1 == 6 1780 ; flat6 filter 1781%if ARCH_X86_32 1782 mova [esp+3*16], m3 1783 mova [esp+4*16], m4 1784 mova [esp+5*16], m5 1785 mova [esp+6*16], m6 1786 %xdefine m8 m3 1787 %xdefine m10 m4 1788 %xdefine m11 m5 1789 %xdefine m15 m6 1790 %define m3 [esp+3*16] 1791 %define m4 [esp+4*16] 1792 %define m5 [esp+5*16] 1793 %define m6 [esp+6*16] 1794 %define m9 %%flat8mem 1795 %define m13 %%p2mem 1796 %define m14 %%q2mem 1797%endif 1798 1799 punpcklbw m8, m13, m5 1800 punpckhbw m11, m13, m5 1801 pmaddubsw m0, m8, [PIC_sym(pb_3_1)] 1802 pmaddubsw m1, m11, [PIC_sym(pb_3_1)] 1803 punpcklbw m7, m4, m3 1804 
punpckhbw m10, m4, m3 1805 pmaddubsw m2, m7, [PIC_sym(pb_2)] 1806 pmaddubsw m15, m10, [PIC_sym(pb_2)] 1807 paddw m0, m2 1808 paddw m1, m15 1809 pmulhrsw m2, m0, [PIC_sym(pw_4096)] 1810 pmulhrsw m15, m1, [PIC_sym(pw_4096)] 1811 packuswb m2, m15 1812 pand m2, m9 1813 pandn m15, m9, m3 1814 por m2, m15 1815%ifidn %2, v 1816 mova [tmpq+strideq*2], m2 ; p1 1817%elif ARCH_X86_32 1818 mova [esp+11*16], m2 1819%endif 1820 1821 pmaddubsw m8, [PIC_sym(pb_m1_1)] 1822 pmaddubsw m11, [PIC_sym(pb_m1_1)] 1823 paddw m0, m8 1824 paddw m1, m11 1825 punpcklbw m8, m13, m6 1826 punpckhbw m11, m13, m6 1827%if ARCH_X86_64 1828 SWAP 2, 13 1829%endif 1830 pmaddubsw m8, [PIC_sym(pb_m1_1)] 1831 pmaddubsw m11, [PIC_sym(pb_m1_1)] 1832 paddw m0, m8 1833 paddw m1, m11 1834 pmulhrsw m2, m0, [PIC_sym(pw_4096)] 1835 pmulhrsw m15, m1, [PIC_sym(pw_4096)] 1836 packuswb m2, m15 1837 pand m2, m9 1838 pandn m15, m9, m4 1839 por m2, m15 1840%ifidn %2, v 1841 mova [tmpq+stride3q], m2 ; p0 1842%elif ARCH_X86_32 1843 mova [esp+8*16], m2 1844%endif 1845 1846 paddw m0, m8 1847 paddw m1, m11 1848 punpcklbw m8, m3, m14 1849 punpckhbw m11, m3, m14 1850%if ARCH_X86_64 1851 SWAP 2, 14 1852%endif 1853 pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] 1854 pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] 1855 paddw m0, m2 1856 paddw m1, m15 1857 pmulhrsw m2, m0, [PIC_sym(pw_4096)] 1858 pmulhrsw m15, m1, [PIC_sym(pw_4096)] 1859 packuswb m2, m15 1860 pand m2, m9 1861 pandn m15, m9, m5 1862 por m2, m15 1863%ifidn %2, v 1864 mova [dstq+strideq*0], m2 ; q0 1865%endif 1866 1867 pmaddubsw m8, [PIC_sym(pb_m1_2)] 1868 pmaddubsw m11, [PIC_sym(pb_m1_2)] 1869 paddw m0, m8 1870 paddw m1, m11 1871 pmaddubsw m7, [PIC_sym(pb_m1_0)] 1872 pmaddubsw m10, [PIC_sym(pb_m1_0)] 1873 paddw m0, m7 1874 paddw m1, m10 1875 pmulhrsw m0, [PIC_sym(pw_4096)] 1876 pmulhrsw m1, [PIC_sym(pw_4096)] 1877 packuswb m0, m1 1878 pand m0, m9 1879 pandn m1, m9, m6 1880 por m0, m1 1881%if ARCH_X86_32 1882 %xdefine m3 m8 1883 %xdefine m4 m10 1884 %xdefine m5 m11 1885 %xdefine m6 m15 
; ---------------------------------------------------------------------------
; Tail of the FILTER %1(=filter width), %2(=v/h) macro (the macro header
; precedes this section): final stores of the flat6 path, the %1 == 4 stores,
; and the macro epilogue.
; ---------------------------------------------------------------------------
%endif
%ifidn %2, v
    mova            [dstq+strideq*1], m0 ; q1
%else
    ; horizontal variant: gather p1/p0/q0/q1 back into m3-m6, then transpose
    ; the 16x4 result into the destination columns
 %if ARCH_X86_64
    SWAP 3, 13
    SWAP 4, 14
 %else
    mova            m3, [esp+11*16]      ; p1 (spilled earlier in flat6)
    mova            m4, [esp+ 8*16]      ; p0 (spilled earlier in flat6)
 %endif
    SWAP 5, 2
    SWAP 6, 0
    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
%endif
%else ; if %1 == 4
%ifidn %2, v
    mova            [tmpq+strideq*0], m3 ; p1
    mova            [tmpq+strideq*1], m4 ; p0
    mova            [tmpq+strideq*2], m5 ; q0
    mova            [tmpq+stride3q ], m6 ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
%endif
%endif
%if ARCH_X86_32
    ; NOTE(review): m12reg is defined outside this section; m12 is remapped
    ; back to its register alias on leaving the macro
    %define m12 m12reg
%endif
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 32-bit PIC helpers ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%if ARCH_X86_32
 %define PIC_base_offset $$

 ; Load the section base ($$) into r2 so RODATA constants can be addressed
 ; position-independently through PIC_sym(). r2 is also maskq in the cglobal
 ; signatures below, so the PIC base and the mask pointer time-share the
 ; register via XCHG_PIC_REG.
 %macro SETUP_PIC 0 ; PIC_reg
  %define PIC_reg r2
  %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
  LEA PIC_reg, $$
 %endmacro

 ; %1 == 0: spill the PIC base to the stack and load the mask pointer into r2
 ; %1 == 1: reload the PIC base into r2 (mask pointer no longer addressable)
 %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
  %if %1 == 0
   mov [esp+PIC_reg_stk_offset], PIC_reg
   mov PIC_reg, maskm
  %else
   mov PIC_reg, [esp+PIC_reg_stk_offset]
  %endif
 %endmacro

 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
 ; 64-bit builds use RIP-relative addressing; PIC bookkeeping is a no-op
 %macro XCHG_PIC_REG 1
 %endmacro
 %define PIC_sym(sym) (sym)
%endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%if ARCH_X86_32
 ; If the platform ABI guarantees less stack alignment than we require, the
 ; prologue realigns esp, making the incoming stack arguments unreachable
 ; through esp-relative defines - they must be copied (copy_args == 1).
 %if STACK_ALIGNMENT < required_stack_alignment
  %assign copy_args 1
 %else
  %assign copy_args 0
 %endif
%endif

; Relocate stack arguments (mask, lut and the 7th argument, %1 = w or h) into
; a fixed esp-relative area when copy_args is set; otherwise the 7th argument
; stays addressable as r6m.
%macro RELOC_ARGS 1
 %if copy_args
  %define maskm     [esp+stack_size-gprsize*1]
  %define l_stridem [esp+stack_size-gprsize*2]
  %define lutm      [esp+stack_size-gprsize*3]
  %define %1m       [esp+stack_size-gprsize*4]
  mov r6d, r6m
  mov maskm, maskd
  mov lutm, lutd
  mov %1m, r6d
 %else
  %define %1m r6m
 %endif
%endmacro

%if ARCH_X86_32
 ; 32-bit aliases for names that are dedicated registers on 64-bit; note
 ; stride3q and l_stride3q share r6 (never live simultaneously)
 %define tmpq       r4
 %define mstrideq   r5
 %define stride3q   r6
 %define l_stride3q r6
%endif

;-----------------------------------------------------------------------------
; lpf_v_sb_y(dst, stride, mask, l, l_stride, lut, w)
; Luma loop filter across vertical block edges for one superblock row,
; processed 4 columns per iteration. mask[2]/mask[1]/mask[0] are per-column
; bitmasks; the widest applicable filter is tried first (flat16 > flat8 > 4).
;-----------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS w
    SETUP_PIC
    %define m12 m5
%endif
    shl             l_strideq, 2        ; scale l_stride to bytes
                                        ; (l[] entries appear 4 bytes wide -
                                        ; confirm against the C caller)
    sub             lq, l_strideq       ; point l at the row above
%if ARCH_X86_64
    mov             mstrideq, strideq
    neg             mstrideq            ; mstride = -stride
    lea             stride3q, [strideq*3]
%else
    mov             l_stridem, l_strided
%endif
    mov             mask_bitsd, 0xf     ; bit group for the first 4 columns
    mova            m12, [PIC_sym(pd_mask)] ; per-column bits {1,2,4,8}
    XCHG_PIC_REG    0                   ; x86-32: r2 now holds maskq
    movu            m0, [maskq]         ; vmask[0..2] (+4 trailing bytes)
    pxor            m4, m4
    movd            m3, [lutq+136]      ; NOTE(review): lut byte at offset 136
                                        ; (presumably Av1FilterLUT sharp[1] -
                                        ; confirm); broadcast as "minlvl"
    pshufb          m3, m4              ; broadcast low byte to all lanes
    pshufd          m2, m0, q2222       ; vmask[2] splatted
    pshufd          m1, m0, q1111       ; vmask[1] splatted
    pshufd          m0, m0, q0000       ; vmask[0] splatted
    por             m1, m2              ; vmask[1] | vmask[2]
    por             m0, m1              ; vmask[0] | vmask[1] | vmask[2]
    mova            [rsp+11*16], m0     ; -> mask0 (any filtering)
    mova            [rsp+12*16], m1     ; -> mask1 (wide filtering)
    mova            [rsp+13*16], m2     ; -> mask2 (flat16 filtering)
    mova            [rsp+14*16], m3     ; -> minlvl

%define maskmem [esp+15*16]
%define mask0   [rsp+11*16]
%define mask1   [rsp+12*16]
%define mask2   [rsp+13*16]
%define minlvl  [rsp+14*16]

.loop:
    ; try the widest filter first
    test            [maskq+8], mask_bitsd ; vmask[2]
    je              .no_flat16

%if ARCH_X86_32
    XCHG_PIC_REG    1                   ; restore PIC base into r2
    mov             [esp+25*16], mask_bitsd ; FILTER clobbers these on x86-32
    mova            maskmem, m12
%endif
    FILTER          16, v
    jmp             .end

.no_flat16:
    test            [maskq+4], mask_bitsd ; vmask[1]
    je              .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG    1
    mov             [esp+25*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          8, v
    jmp             .end

.no_flat:
    test            [maskq+0], mask_bitsd ; vmask[0]
    XCHG_PIC_REG    1
    je              .no_filter

%if ARCH_X86_32
    mov             [esp+25*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          4, v

.end:
%if ARCH_X86_32
    mova            m12, maskmem        ; reload what FILTER clobbered
    mov             mask_bitsd, [esp+25*16]
%endif
.no_filter:
    pslld           m12, 4              ; advance per-column bits...
    shl             mask_bitsd, 4       ; ...to the next group of 4 columns
    add             lq, 16
    add             dstq, 16
%if ARCH_X86_64
    sub             wd, 4
%else
    sub             dword wm, 4
%endif
    XCHG_PIC_REG    0
    jg              .loop
    RET

;-----------------------------------------------------------------------------
; lpf_h_sb_y(dst, stride, mask, l, l_stride, lut, h)
; Luma loop filter across horizontal block edges, 4 rows per iteration.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \
                    dst, stride, mask, l, l_stride, lut, \
                    h, stride3, l_stride3, tmp, mask_bits
%else
cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS h
    SETUP_PIC
    %define m12 m5
%endif
    sub             lq, 4               ; point l at the entry to the left
    shl             l_strideq, 2        ; scale l_stride to bytes
%if ARCH_X86_64
    lea             stride3q, [strideq*3]
    lea             l_stride3q, [l_strideq*3]
%else
    mov             l_stridem, l_strided
%endif
    mov             mask_bitsd, 0xf
    mova            m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG    0
    movu            m0, [maskq]         ; hmask[0..2]
    pxor            m4, m4
    movd            m3, [lutq+136]      ; see note in lpf_v_sb_y
    pshufb          m3, m4
    pshufd          m2, m0, q2222
    pshufd          m1, m0, q1111
    pshufd          m0, m0, q0000
    por             m1, m2
    por             m0, m1
    mova            [rsp+22*16], m0     ; -> mask0
    mova            [rsp+23*16], m1     ; -> mask1
    mova            [rsp+24*16], m2     ; -> mask2
    mova            [rsp+25*16], m3     ; -> minlvl

%define maskmem [esp+37*16]
%define mask0   [rsp+22*16]
%define mask1   [rsp+23*16]
%define mask2   [rsp+24*16]
%define minlvl  [rsp+25*16]

.loop:
    test            [maskq+8], mask_bitsd ; vmask[2]
    je              .no_flat16

%if ARCH_X86_32
    XCHG_PIC_REG    1
    mov             [esp+38*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          16, h
    jmp             .end

.no_flat16:
    test            [maskq+4], mask_bitsd ; vmask[1]
    je              .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG    1
    mov             [esp+38*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          8, h
    jmp             .end

.no_flat:
    test            [maskq+0], mask_bitsd ; vmask[0]
    XCHG_PIC_REG    1
    je              .no_filter

%if ARCH_X86_32
    mov             [esp+38*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          4, h
    jmp             .end

.no_filter:
    ; nothing filtered: just step down 16 rows
    lea             dstq, [dstq+strideq*8]
    lea             dstq, [dstq+strideq*8]
%if ARCH_X86_32
    jmp             .end_noload
.end:
    mova            m12, maskmem
    mov             l_strideq, l_stridem
    mov             mask_bitsd, [esp+38*16]
.end_noload:
%else
.end:
%endif
    lea             lq, [lq+l_strideq*4]
    pslld           m12, 4
    shl             mask_bitsd, 4
%if ARCH_X86_64
    sub             hd, 4
%else
    sub             dword hm, 4
%endif
    XCHG_PIC_REG    0
    jg              .loop
    RET

;-----------------------------------------------------------------------------
; lpf_v_sb_uv(dst, stride, mask, l, l_stride, lut, w)
; Chroma loop filter across vertical edges. Chroma has no flat16 path, so
; only vmask[0..1] are loaded (movq) and the wide filter is 6-tap.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
                     dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS w
    SETUP_PIC
    %define m12 m4
%endif
    shl             l_strideq, 2        ; scale l_stride to bytes
    sub             lq, l_strideq       ; point l at the row above
%if ARCH_X86_64
    mov             mstrideq, strideq
    neg             mstrideq
    lea             stride3q, [strideq*3]
%else
    mov             l_stridem, l_strided
%endif
    mov             mask_bitsd, 0xf
    mova            m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG    0
    movq            m0, [maskq]         ; vmask[0..1] only (no flat16 chroma)
    pxor            m3, m3
    movd            m2, [lutq+136]      ; see note in lpf_v_sb_y
    pshufb          m2, m3
    pshufd          m1, m0, q1111       ; vmask[1] splatted
    pshufd          m0, m0, q0000       ; vmask[0] splatted
    por             m0, m1              ; vmask[0] | vmask[1]
    mova            [rsp+0*16], m0      ; -> mask0
    mova            [rsp+1*16], m1      ; -> mask1
    mova            [rsp+2*16], m2      ; -> minlvl

%define maskmem [esp+7*16]
%define mask0   [rsp+0*16]
%define mask1   [rsp+1*16]
%define minlvl  [rsp+2*16]

.loop:
    test            [maskq+4], mask_bitsd ; vmask[1]
    je              .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG    1
    mov             [esp+11*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          6, v
    jmp             .end

.no_flat:
    test            [maskq+0], mask_bitsd ; vmask[0]
    XCHG_PIC_REG    1
    je              .no_filter

%if ARCH_X86_32
    mov             [esp+11*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          4, v

.end:
%if ARCH_X86_32
    mova            m12, maskmem
    mov             mask_bitsd, [esp+11*16]
%endif
.no_filter:
    pslld           m12, 4
    shl             mask_bitsd, 4
    add             lq, 16
    add             dstq, 16
%if ARCH_X86_64
    sub             wd, 4
%else
    sub             dword wm, 4
%endif
    XCHG_PIC_REG    0
    jg              .loop
    RET

;-----------------------------------------------------------------------------
; lpf_h_sb_uv(dst, stride, mask, l, l_stride, lut, h)
; Chroma loop filter across horizontal edges (6-tap wide path, no flat16).
;-----------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \
                     dst, stride, mask, l, l_stride, lut, \
                     h, stride3, l_stride3, tmp, mask_bits
%else
cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \
                     dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS h
    SETUP_PIC
    %define m12 m4
%endif
    sub             lq, 4               ; point l at the entry to the left
    shl             l_strideq, 2        ; scale l_stride to bytes
%if ARCH_X86_64
    lea             stride3q, [strideq*3]
    lea             l_stride3q, [l_strideq*3]
%else
    mov             l_stridem, l_strided
%endif
    mov             mask_bitsd, 0xf
    mova            m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG    0
    movq            m0, [maskq]         ; hmask[0..1] only
    pxor            m3, m3
    movd            m2, [lutq+136]      ; see note in lpf_v_sb_y
    pshufb          m2, m3
    pshufd          m1, m0, q1111
    pshufd          m0, m0, q0000
    por             m0, m1
    mova            [rsp+0*16], m0      ; -> mask0
    mova            [rsp+1*16], m1      ; -> mask1
    mova            [rsp+2*16], m2      ; -> minlvl

%define maskmem [esp+7*16]
%define mask0   [rsp+0*16]
%define mask1   [rsp+1*16]
%define minlvl  [rsp+2*16]

.loop:
    test            [maskq+4], mask_bitsd ; vmask[1]
    je              .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG    1
    mov             [esp+12*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          6, h
    jmp             .end

.no_flat:
    test            [maskq+0], mask_bitsd ; vmask[0]
    XCHG_PIC_REG    1
    je              .no_filter

%if ARCH_X86_32
    mov             [esp+12*16], mask_bitsd
    mova            maskmem, m12
%endif
    FILTER          4, h
    jmp             .end

.no_filter:
    ; nothing filtered: just step down 16 rows
    lea             dstq, [dstq+strideq*8]
    lea             dstq, [dstq+strideq*8]
%if ARCH_X86_32
    jmp             .end_noload
.end:
    mova            m12, maskmem
    mov             l_strided, l_stridem
    mov             mask_bitsd, [esp+12*16]
.end_noload:
%else
.end:
%endif
    lea             lq, [lq+l_strideq*4]
    pslld           m12, 4
    shl             mask_bitsd, 4
%if ARCH_X86_64
    sub             hd, 4
%else
    sub             dword hm, 4
%endif
    XCHG_PIC_REG    0
    jg              .loop
    RET