1; Copyright © 2018, VideoLAN and dav1d authors 2; Copyright © 2018, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

%include "config.asm"
%include "ext/x86/x86inc.asm"


SECTION_RODATA 16

; pshufb control masks operating on 16-bit words.
; deint_shuf gathers the even-numbered words into the low half and the
; odd-numbered words into the high half of the register.
deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15

deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7

; COEF_PAIR a, b [, flag]
; Emits interleaved word-pair constants (4 pairs = 16 bytes each) for pmaddwd:
;   pw_<a>_m<b>  = { a, -b }   always
;   pw_<b>_<a>   = { b,  a }   unless flag == 2
;   pw_m<a>_m<b> = {-a, -b }   when flag != 0
%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
pw_%1_m%2:  times 4 dw  %1, -%2
%if %3 != 2
pw_%2_%1:   times 4 dw  %2,  %1
%endif
%if %3
pw_m%1_m%2: times 4 dw -%1, -%2
%endif
%endmacro

;adst4
pw_1321_3803:   times 4 dw  1321,  3803
pw_2482_m1321:  times 4 dw  2482, -1321
pw_3344_2482:   times 4 dw  3344,  2482
pw_3344_m3803:  times 4 dw  3344, -3803
pw_3344_m3344:  times 4 dw  3344, -3344
pw_0_3344:      times 4 dw     0,  3344
pw_m6688_m3803: times 4 dw -6688, -3803

COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR  799, 4017
COEF_PAIR 3406, 2276
COEF_PAIR  401, 4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR 3784, 1567, 1
COEF_PAIR  995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4017,  799, 1
COEF_PAIR  201, 4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052,  601
COEF_PAIR 2276, 3406, 1
COEF_PAIR 4076,  401, 2
COEF_PAIR 2598, 3166, 2
COEF_PAIR 3612, 1931, 2
COEF_PAIR 1189, 3920, 2

; Broadcast constants. pd_* hold 4 dwords, pw_* hold 8 words.
; The pw_<c>x8 entries store c*8 so that pmulhrsw, which computes
; (a*b + 0x4000) >> 15, yields (a*c + 2048) >> 12.
pd_2048:        times 4 dd  2048
pw_2048:        times 8 dw  2048
pw_m2048:       times 8 dw -2048
pw_4096:        times 8 dw  4096
pw_16384:       times 8 dw  16384
pw_m16384:      times 8 dw -16384
pw_1697x16:     times 8 dw  1697*16
pw_1697x8:      times 8 dw  1697*8
pw_2896x8:      times 8 dw  2896*8
pw_3344x8:      times 8 dw  3344*8
pw_8192:        times 8 dw  8192
pw_m8192:       times 8 dw -8192
pw_5:           times 8 dw  5
pw_201x8:       times 8 dw  201*8
pw_4091x8:      times 8 dw  4091*8
pw_m2751x8:     times 8 dw -2751*8
pw_3035x8:      times 8 dw  3035*8
pw_1751x8:      times 8 dw  1751*8
pw_3703x8:      times 8 dw  3703*8
pw_m1380x8:     times 8 dw -1380*8
pw_3857x8:      times 8 dw  3857*8
pw_995x8:       times 8 dw  995*8
102pw_3973x8: times 8 dw 3973*8 103pw_m2106x8: times 8 dw -2106*8 104pw_3513x8: times 8 dw 3513*8 105pw_2440x8: times 8 dw 2440*8 106pw_3290x8: times 8 dw 3290*8 107pw_m601x8: times 8 dw -601*8 108pw_4052x8: times 8 dw 4052*8 109 110pw_4095x8: times 8 dw 4095*8 111pw_101x8: times 8 dw 101*8 112pw_2967x8: times 8 dw 2967*8 113pw_m2824x8: times 8 dw -2824*8 114pw_3745x8: times 8 dw 3745*8 115pw_1660x8: times 8 dw 1660*8 116pw_3822x8: times 8 dw 3822*8 117pw_m1474x8: times 8 dw -1474*8 118pw_3996x8: times 8 dw 3996*8 119pw_897x8: times 8 dw 897*8 120pw_3461x8: times 8 dw 3461*8 121pw_m2191x8: times 8 dw -2191*8 122pw_3349x8: times 8 dw 3349*8 123pw_2359x8: times 8 dw 2359*8 124pw_4036x8: times 8 dw 4036*8 125pw_m700x8: times 8 dw -700*8 126pw_4065x8: times 8 dw 4065*8 127pw_501x8: times 8 dw 501*8 128pw_3229x8: times 8 dw 3229*8 129pw_m2520x8: times 8 dw -2520*8 130pw_3564x8: times 8 dw 3564*8 131pw_2019x8: times 8 dw 2019*8 132pw_3948x8: times 8 dw 3948*8 133pw_m1092x8: times 8 dw -1092*8 134pw_3889x8: times 8 dw 3889*8 135pw_1285x8: times 8 dw 1285*8 136pw_3659x8: times 8 dw 3659*8 137pw_m1842x8: times 8 dw -1842*8 138pw_3102x8: times 8 dw 3102*8 139pw_2675x8: times 8 dw 2675*8 140pw_4085x8: times 8 dw 4085*8 141pw_m301x8: times 8 dw -301*8 142 143SECTION .text 144 145%macro REPX 2-* 146 %xdefine %%f(x) %1 147%rep %0 - 1 148 %rotate 1 149 %%f(%1) 150%endrep 151%endmacro 152 153%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) 154 155%if ARCH_X86_64 156%define o(x) x 157%else 158%define o(x) r5-$$+x ; PIC 159%endif 160 161%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] 162 lea r2, [dstq+strideq*2] 163%assign %%i 1 164%rotate 5 165%rep 4 166 %if %1 & 2 167 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) 168 %else 169 CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) 170 %endif 171 %assign %%i %%i + 1 172 %rotate 1 173%endrep 174 175 movd m%3, [%%row_adr1] ;dst0 176 movd m%5, [%%row_adr2] ;dst1 177 punpckldq m%3, m%5 ;high: dst1 :low: dst0 178 movd m%4, 
[%%row_adr3] ;dst2 179 movd m%5, [%%row_adr4] ;dst3 180 punpckldq m%4, m%5 ;high: dst3 :low: dst2 181 182 pxor m%5, m%5 183 punpcklbw m%3, m%5 ;extend byte to word 184 punpcklbw m%4, m%5 ;extend byte to word 185 186 paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 187 paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 188 189 packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 190 191 movd [%%row_adr1], m%3 ;store dst0 + out0 192 pshuflw m%4, m%3, q1032 193 movd [%%row_adr2], m%4 ;store dst1 + out1 194 punpckhqdq m%3, m%3 195 movd [%%row_adr3], m%3 ;store dst2 + out2 196 psrlq m%3, 32 197 movd [%%row_adr4], m%3 ;store dst3 + out3 198%endmacro 199 200%macro ITX4_END 4-5 2048 ; row[1-4], rnd 201%if %5 202 mova m2, [o(pw_%5)] 203 pmulhrsw m0, m2 204 pmulhrsw m1, m2 205%endif 206 207 WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 208 ret 209%endmacro 210 211; flags: 1 = swap, 2: coef_regs, 4: no_pack 212%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags 213%if %6 & 2 214 pmaddwd m%2, m%4, m%1 215 pmaddwd m%1, m%5 216%elif %6 & 1 217 pmaddwd m%2, m%1, [o(pw_%5_%4)] 218 pmaddwd m%1, [o(pw_%4_m%5)] 219%else 220 pmaddwd m%2, m%1, [o(pw_%4_m%5)] 221 pmaddwd m%1, [o(pw_%5_%4)] 222%endif 223 paddd m%2, m%3 224 paddd m%1, m%3 225 psrad m%2, 12 226 psrad m%1, 12 227%if %6 & 4 == 0 228 packssdw m%1, m%2 229%endif 230%endmacro 231 232%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 233 mova m3, [o(pd_2048)] 234 punpckhwd m2, m0, m1 ;unpacked in1 in3 235 punpcklwd m0, m1 ;unpacked in0 in2 236 ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 237 ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 238 psubsw m1, m0, m2 ;high: out2 ;low: out3 239 paddsw m0, m2 ;high: out1 ;low: out0 240%endmacro 241 242%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack 243cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 244 %define %%p1 m(i%1_%3_internal) 245%if ARCH_X86_32 246 LEA r5, $$ 247%endif 248%if has_epilogue 249%ifidn %1_%2, dct_dct 250 test eobd, eobd 251 jz %%end 
252%endif 253 lea tx2q, [o(m(i%2_%3_internal).pass2)] 254 call %%p1 255 RET 256%%end: 257%else 258 lea tx2q, [o(m(i%2_%3_internal).pass2)] 259%ifidn %1_%2, dct_dct 260 test eobd, eobd 261 jnz %%p1 262%else 263 times ((%%end - %%p1) >> 31) & 1 jmp %%p1 264ALIGN function_align 265%%end: 266%endif 267%endif 268%endmacro 269 270%macro INV_TXFM_4X4_FN 2 ; type1, type2 271 INV_TXFM_FN %1, %2, 4x4, 6 272%ifidn %1_%2, dct_dct 273 pshuflw m0, [coeffq], q0000 274 punpcklqdq m0, m0 275 mova m1, [o(pw_2896x8)] 276 pmulhrsw m0, m1 277 mov [coeffq], eobd ;0 278 pmulhrsw m0, m1 279 mova m1, m0 280 TAIL_CALL m(iadst_4x4_internal).end2 281%endif 282%endmacro 283 284INIT_XMM ssse3 285 286INV_TXFM_4X4_FN dct, dct 287INV_TXFM_4X4_FN dct, adst 288INV_TXFM_4X4_FN dct, flipadst 289INV_TXFM_4X4_FN dct, identity 290 291cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 292 mova m0, [coeffq+16*0] ;high: in1 ;low: in0 293 mova m1, [coeffq+16*1] ;high: in3 ;low in2 294 295 IDCT4_1D_PACKED 296 297 mova m2, [o(deint_shuf)] 298 shufps m3, m0, m1, q1331 299 shufps m0, m1, q0220 300 pshufb m0, m2 ;high: in1 ;low: in0 301 pshufb m1, m3, m2 ;high: in3 ;low :in2 302 jmp tx2q 303 304.pass2: 305 IDCT4_1D_PACKED 306 307 pxor m2, m2 308 mova [coeffq+16*0], m2 309 mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); 310 311 ITX4_END 0, 1, 3, 2 312 313INV_TXFM_4X4_FN adst, dct 314INV_TXFM_4X4_FN adst, adst 315INV_TXFM_4X4_FN adst, flipadst 316INV_TXFM_4X4_FN adst, identity 317 318cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 319 mova m0, [coeffq+16*0] 320 mova m1, [coeffq+16*1] 321 call .main 322 punpckhwd m2, m0, m1 323 punpcklwd m0, m1 324 punpckhwd m1, m0, m2 ;high: in3 ;low :in2 325 punpcklwd m0, m2 ;high: in1 ;low: in0 326 jmp tx2q 327 328.pass2: 329 call .main 330 331.end: 332 pxor m2, m2 333 mova [coeffq+16*0], m2 334 mova [coeffq+16*1], m2 335 336.end2: 337 ITX4_END 0, 1, 2, 3 338 339ALIGN function_align 340.main: 341 punpcklwd m2, m0, m1 ;unpacked 
in0 in2 342 punpckhwd m0, m1 ;unpacked in1 in3 343 mova m3, m0 344 pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 345 pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 346 paddd m1, m0 ;t2 347 pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 348 pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 349 pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 350 pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 351 paddd m4, m0 ;t0 + t3 352 pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 353 mova m0, [o(pd_2048)] 354 paddd m1, m0 ;t2 + 2048 355 paddd m2, m0 356 paddd m0, m4 ;t0 + t3 + 2048 357 paddd m5, m2 ;t1 + t3 + 2048 358 paddd m2, m4 359 paddd m2, m3 ;t0 + t1 - t3 + 2048 360 REPX {psrad x, 12}, m1, m0, m5, m2 361 packssdw m0, m5 ;high: out1 ;low: out0 362 packssdw m1, m2 ;high: out3 ;low: out3 363 ret 364 365INV_TXFM_4X4_FN flipadst, dct 366INV_TXFM_4X4_FN flipadst, adst 367INV_TXFM_4X4_FN flipadst, flipadst 368INV_TXFM_4X4_FN flipadst, identity 369 370cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 371 mova m0, [coeffq+16*0] 372 mova m1, [coeffq+16*1] 373 call m(iadst_4x4_internal).main 374 punpcklwd m2, m1, m0 375 punpckhwd m1, m0 376 punpcklwd m0, m1, m2 ;high: in3 ;low :in2 377 punpckhwd m1, m2 ;high: in1 ;low: in0 378 jmp tx2q 379 380.pass2: 381 call m(iadst_4x4_internal).main 382 383.end: 384 pxor m2, m2 385 mova [coeffq+16*0], m2 386 mova [coeffq+16*1], m2 387 388.end2: 389 ITX4_END 3, 2, 1, 0 390 391INV_TXFM_4X4_FN identity, dct 392INV_TXFM_4X4_FN identity, adst 393INV_TXFM_4X4_FN identity, flipadst 394INV_TXFM_4X4_FN identity, identity 395 396cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 397 mova m0, [coeffq+16*0] 398 mova m1, [coeffq+16*1] 399 mova m3, [o(pw_1697x8)] 400 pmulhrsw m2, m0, m3 401 pmulhrsw m3, m1 402 paddsw m0, m2 403 paddsw m1, m3 404 punpckhwd m2, m0, m1 405 punpcklwd m0, m1 406 punpckhwd m1, m0, m2 ;high: in3 ;low :in2 407 punpcklwd m0, m2 ;high: in1 
;low: in0 408 jmp tx2q 409 410.pass2: 411 mova m3, [o(pw_1697x8)] 412 pmulhrsw m2, m3, m0 413 pmulhrsw m3, m1 414 paddsw m0, m2 415 paddsw m1, m3 416 jmp m(iadst_4x4_internal).end 417 418%macro IWHT4_1D_PACKED 0 419 punpckhqdq m3, m0, m1 ;low: in1 high: in3 420 punpcklqdq m0, m1 ;low: in0 high: in2 421 psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 422 paddw m0, m3 ;low: in0 + in1 high: in2 + in3 423 punpckhqdq m2, m2 ;t2 t2 424 punpcklqdq m0, m0 ;t0 t0 425 psubw m1, m0, m2 426 psraw m1, 1 ;t4 t4 427 psubw m1, m3 ;low: t1/out2 high: t3/out1 428 psubw m0, m1 ;high: out0 429 paddw m2, m1 ;low: out3 430%endmacro 431 432cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff 433 mova m0, [coeffq+16*0] 434 mova m1, [coeffq+16*1] 435 pxor m2, m2 436 mova [coeffq+16*0], m2 437 mova [coeffq+16*1], m2 438 psraw m0, 2 439 psraw m1, 2 440 441 IWHT4_1D_PACKED 442 443 punpckhwd m0, m1 444 punpcklwd m3, m1, m2 445 punpckhdq m1, m0, m3 446 punpckldq m0, m3 447 448 IWHT4_1D_PACKED 449 450 shufpd m0, m2, 0x01 451 ITX4_END 0, 3, 2, 1, 0 452 453 454%macro IDCT8_1D_PACKED 0 455 mova m6, [o(pd_2048)] 456 punpckhwd m4, m0, m3 ;unpacked in1 in7 457 punpcklwd m0, m2 ;unpacked in0 in4 458 punpckhwd m2, m1 ;unpacked in5 in3 459 punpcklwd m1, m3 ;unpacked in2 in6 460 ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a 461 ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a 462 ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 463 psubsw m3, m4, m2 ;low: t6a high: t5a 464 paddsw m4, m2 ;low: t7 high: t4 465 pshufb m3, [o(deint_shuf1)] 466 ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 467 ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 468 psubsw m2, m0, m1 ;low: tmp3 high: tmp2 469 paddsw m0, m1 ;low: tmp0 high: tmp1 470 punpcklqdq m1, m4, m3 ;low: t7 high: t6 471 punpckhqdq m4, m3 ;low: t4 high: t5 472 psubsw m3, m0, m1 ;low: out7 high: out6 473 paddsw m0, m1 ;low: out0 high: out1 474 paddsw m1, m2, m4 ;low: out3 high: out2 475 psubsw m2, m4 ;low: out4 high: out5 
476%endmacro 477 478;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 479;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 480%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 481 punpckhwd m%4, m%1, m%2 482 punpcklwd m%1, m%2 483%if %7 < 8 484 pmaddwd m%2, m%7, m%1 485 pmaddwd m%3, m%7, m%4 486%else 487 mova m%2, [o(pw_%7_%6)] 488%if %8 489 pmaddwd m%3, m%1, m%2 490 pmaddwd m%2, m%4 491%else 492 pmaddwd m%3, m%4, m%2 493 pmaddwd m%2, m%1 494%endif 495%endif 496 paddd m%3, m%5 497 paddd m%2, m%5 498 psrad m%3, 12 499 psrad m%2, 12 500%if %8 501 packssdw m%3, m%2 502%else 503 packssdw m%2, m%3 ;dst2 504%endif 505%if %7 < 8 506 pmaddwd m%4, m%6 507 pmaddwd m%1, m%6 508%elif %8 509 mova m%2, [o(pw_%6_m%7)] 510 pmaddwd m%4, m%2 511 pmaddwd m%1, m%2 512%else 513 mova m%3, [o(pw_%6_m%7)] 514 pmaddwd m%4, m%3 515 pmaddwd m%1, m%3 516%endif 517 paddd m%4, m%5 518 paddd m%1, m%5 519 psrad m%4, 12 520 psrad m%1, 12 521 packssdw m%1, m%4 ;dst1 522%endmacro 523 524%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 525 ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 526 ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 527 psubsw m%3, m%1, m%2 ;out2 528 paddsw m%2, m%1 ;out1 529 paddsw m%1, m%5, m%4 ;out0 530 psubsw m%4, m%5 ;out3 531%endmacro 532 533%macro WRITE_4X8 4 ;row[1-4] 534 WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 535 lea dstq, [dstq+strideq*4] 536 WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 537%endmacro 538 539%macro INV_4X8 0 540 punpckhwd m4, m2, m3 541 punpcklwd m2, m3 542 punpckhwd m3, m0, m1 543 punpcklwd m0, m1 544 punpckhdq m1, m0, m2 ;low: in2 high: in3 545 punpckldq m0, m2 ;low: in0 high: in1 546 punpckldq m2, m3, m4 ;low: in4 high: in5 547 punpckhdq m3, m4 ;low: in6 high: in7 548%endmacro 549 550%macro INV_TXFM_4X8_FN 2 ; type1, type2 551 INV_TXFM_FN %1, %2, 4x8, 8 552%ifidn %1_%2, dct_dct 553 pshuflw m0, [coeffq], q0000 554 punpcklqdq m0, m0 555 mova m1, [o(pw_2896x8)] 556 pmulhrsw m0, m1 557 mov [coeffq], eobd 558 
pmulhrsw m0, m1 559 pmulhrsw m0, m1 560 pmulhrsw m0, [o(pw_2048)] 561 mova m1, m0 562 mova m2, m0 563 mova m3, m0 564 TAIL_CALL m(iadst_4x8_internal).end3 565%endif 566%endmacro 567 568INV_TXFM_4X8_FN dct, dct 569INV_TXFM_4X8_FN dct, adst 570INV_TXFM_4X8_FN dct, flipadst 571INV_TXFM_4X8_FN dct, identity 572 573cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 574 mova m3, [o(pw_2896x8)] 575 pmulhrsw m0, m3, [coeffq+16*0] 576 pmulhrsw m1, m3, [coeffq+16*1] 577 pmulhrsw m2, m3, [coeffq+16*2] 578 pmulhrsw m3, [coeffq+16*3] 579 580.pass1: 581 call m(idct_8x4_internal).main 582 jmp m(iadst_4x8_internal).pass1_end 583 584.pass2: 585 call .main 586 shufps m1, m1, q1032 587 shufps m3, m3, q1032 588 mova m4, [o(pw_2048)] 589 jmp m(iadst_4x8_internal).end2 590 591ALIGN function_align 592.main: 593 IDCT8_1D_PACKED 594 ret 595 596 597INV_TXFM_4X8_FN adst, dct 598INV_TXFM_4X8_FN adst, adst 599INV_TXFM_4X8_FN adst, flipadst 600INV_TXFM_4X8_FN adst, identity 601 602cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 603 mova m3, [o(pw_2896x8)] 604 pmulhrsw m0, m3, [coeffq+16*0] 605 pmulhrsw m1, m3, [coeffq+16*1] 606 pmulhrsw m2, m3, [coeffq+16*2] 607 pmulhrsw m3, [coeffq+16*3] 608 609.pass1: 610 call m(iadst_8x4_internal).main 611 612.pass1_end: 613 INV_4X8 614 jmp tx2q 615 616.pass2: 617 shufps m0, m0, q1032 618 shufps m1, m1, q1032 619 call .main 620 mova m4, [o(pw_2048)] 621 pxor m5, m5 622 psubw m5, m4 623 624.end: 625 punpcklqdq m4, m5 626 627.end2: 628 pmulhrsw m0, m4 629 pmulhrsw m1, m4 630 pmulhrsw m2, m4 631 pmulhrsw m3, m4 632 pxor m5, m5 633 mova [coeffq+16*0], m5 634 mova [coeffq+16*1], m5 635 mova [coeffq+16*2], m5 636 mova [coeffq+16*3], m5 637 638.end3: 639 WRITE_4X8 0, 1, 2, 3 640 RET 641 642ALIGN function_align 643.main: 644 mova m6, [o(pd_2048)] 645 punpckhwd m4, m3, m0 ;unpacked in7 in0 646 punpckhwd m5, m2, m1 ;unpacked in5 in2 647 punpcklwd m1, m2 ;unpacked in3 in4 648 punpcklwd m0, m3 ;unpacked in1 in6 649 ITX_MUL2X_PACK 4, 2, 6, 
401, 4076 ;low: t0a high: t1a 650 ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a 651 ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a 652 ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a 653 654 psubsw m3, m4, m1 ;low: t4 high: t5 655 paddsw m4, m1 ;low: t0 high: t1 656 psubsw m2, m5, m0 ;low: t6 high: t7 657 paddsw m5, m0 ;low: t2 high: t3 658 659 shufps m1, m3, m2, q1032 660 punpckhwd m2, m1 661 punpcklwd m3, m1 662 ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a 663 ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a 664 665 psubsw m1, m4, m5 ;low: t2 high: t3 666 paddsw m4, m5 ;low: out0 high: -out7 667 psubsw m5, m3, m2 ;low: t7 high: t6 668 paddsw m3, m2 ;low: out6 high: -out1 669 shufps m0, m4, m3, q3210 ;low: out0 high: -out1 670 shufps m3, m4, q3210 ;low: out6 high: -out7 671 672 mova m2, [o(pw_2896_m2896)] 673 mova m7, [o(pw_2896_2896)] 674 shufps m4, m1, m5, q1032 ;low: t3 high: t7 675 shufps m1, m5, q3210 ;low: t2 high: t6 676 punpcklwd m5, m1, m4 677 punpckhwd m1, m4 678 pmaddwd m4, m2, m1 ;-out5 679 pmaddwd m2, m5 ; out4 680 pmaddwd m1, m7 ; out2 681 pmaddwd m5, m7 ;-out3 682 REPX {paddd x, m6}, m4, m2, m1, m5 683 REPX {psrad x, 12}, m4, m2, m1, m5 684 packssdw m1, m5 ;low: out2 high: -out3 685 packssdw m2, m4 ;low: out4 high: -out5 686 ret 687 688INV_TXFM_4X8_FN flipadst, dct 689INV_TXFM_4X8_FN flipadst, adst 690INV_TXFM_4X8_FN flipadst, flipadst 691INV_TXFM_4X8_FN flipadst, identity 692 693cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 694 mova m3, [o(pw_2896x8)] 695 pmulhrsw m0, m3, [coeffq+16*0] 696 pmulhrsw m1, m3, [coeffq+16*1] 697 pmulhrsw m2, m3, [coeffq+16*2] 698 pmulhrsw m3, [coeffq+16*3] 699 700.pass1: 701 call m(iadst_8x4_internal).main 702 703 punpcklwd m4, m3, m2 704 punpckhwd m3, m2 705 punpcklwd m5, m1, m0 706 punpckhwd m1, m0 707 punpckldq m2, m3, m1 ;low: in4 high: in5 708 punpckhdq m3, m1 ;low: in6 high: in7 709 punpckldq m0, m4, m5 ;low: in0 high: in1 710 punpckhdq m1, m4, m5 
;low: in2 high: in3 711 jmp tx2q 712 713.pass2: 714 shufps m0, m0, q1032 715 shufps m1, m1, q1032 716 call m(iadst_4x8_internal).main 717 718 mova m4, m0 719 mova m5, m1 720 pshufd m0, m3, q1032 721 pshufd m1, m2, q1032 722 pshufd m2, m5, q1032 723 pshufd m3, m4, q1032 724 mova m5, [o(pw_2048)] 725 pxor m4, m4 726 psubw m4, m5 727 jmp m(iadst_4x8_internal).end 728 729INV_TXFM_4X8_FN identity, dct 730INV_TXFM_4X8_FN identity, adst 731INV_TXFM_4X8_FN identity, flipadst 732INV_TXFM_4X8_FN identity, identity 733 734cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 735 mova m3, [o(pw_2896x8)] 736 pmulhrsw m0, m3, [coeffq+16*0] 737 pmulhrsw m1, m3, [coeffq+16*1] 738 pmulhrsw m2, m3, [coeffq+16*2] 739 pmulhrsw m3, [coeffq+16*3] 740 741.pass1: 742 mova m7, [o(pw_1697x8)] 743 pmulhrsw m4, m7, m0 744 pmulhrsw m5, m7, m1 745 pmulhrsw m6, m7, m2 746 pmulhrsw m7, m3 747 paddsw m0, m4 748 paddsw m1, m5 749 paddsw m2, m6 750 paddsw m3, m7 751 jmp m(iadst_4x8_internal).pass1_end 752 753.pass2: 754 mova m4, [o(pw_4096)] 755 jmp m(iadst_4x8_internal).end2 756 757 758%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] 759 movq m%3, [dstq ] 760 movq m%4, [dstq+strideq] 761 pxor m%5, m%5 762 punpcklbw m%3, m%5 ;extend byte to word 763 punpcklbw m%4, m%5 ;extend byte to word 764%ifnum %1 765 paddw m%3, m%1 766%else 767 paddw m%3, %1 768%endif 769%ifnum %2 770 paddw m%4, m%2 771%else 772 paddw m%4, %2 773%endif 774 packuswb m%3, m%4 775 movq [dstq ], m%3 776 punpckhqdq m%3, m%3 777 movq [dstq+strideq], m%3 778%endmacro 779 780%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] 781 WRITE_8X2 %1, %2, %5, %6, %7 782 lea dstq, [dstq+strideq*2] 783 WRITE_8X2 %3, %4, %5, %6, %7 784%endmacro 785 786%macro INV_TXFM_8X4_FN 2 ; type1, type2 787 INV_TXFM_FN %1, %2, 8x4, 8 788%ifidn %1_%2, dct_dct 789 pshuflw m0, [coeffq], q0000 790 punpcklqdq m0, m0 791 mova m1, [o(pw_2896x8)] 792 pmulhrsw m0, m1 793 pmulhrsw m0, m1 794 mova m2, [o(pw_2048)] 795 pmulhrsw m0, m1 796 pmulhrsw m0, m2 797 mova m1, m0 798 
mova m2, m0 799 mova m3, m0 800 TAIL_CALL m(iadst_8x4_internal).end2 801%endif 802%endmacro 803 804INV_TXFM_8X4_FN dct, dct 805INV_TXFM_8X4_FN dct, adst 806INV_TXFM_8X4_FN dct, flipadst 807INV_TXFM_8X4_FN dct, identity 808 809cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 810 mova m3, [o(pw_2896x8)] 811 pmulhrsw m0, m3, [coeffq+16*0] 812 pmulhrsw m1, m3, [coeffq+16*1] 813 pmulhrsw m2, m3, [coeffq+16*2] 814 pmulhrsw m3, [coeffq+16*3] 815 816 call m(idct_4x8_internal).main 817 818 mova m4, [o(deint_shuf1)] 819 mova m5, [o(deint_shuf2)] 820 pshufb m0, m4 821 pshufb m1, m5 822 pshufb m2, m4 823 pshufb m3, m5 824 punpckhdq m4, m0, m1 825 punpckldq m0, m1 826 punpckhdq m5, m2, m3 827 punpckldq m2, m3 828 punpckhqdq m1, m0, m2 ;in1 829 punpcklqdq m0, m2 ;in0 830 punpckhqdq m3, m4, m5 ;in3 831 punpcklqdq m2 ,m4, m5 ;in2 832 jmp tx2q 833 834.pass2: 835 call .main 836 jmp m(iadst_8x4_internal).end 837 838ALIGN function_align 839.main: 840 mova m6, [o(pd_2048)] 841 IDCT4_1D 0, 1, 2, 3, 4, 5, 6 842 ret 843 844INV_TXFM_8X4_FN adst, dct 845INV_TXFM_8X4_FN adst, adst 846INV_TXFM_8X4_FN adst, flipadst 847INV_TXFM_8X4_FN adst, identity 848 849cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 850 mova m3, [o(pw_2896x8)] 851 pmulhrsw m0, m3, [coeffq+16*0] 852 pmulhrsw m1, m3, [coeffq+16*1] 853 pmulhrsw m2, m3, [coeffq+16*2] 854 pmulhrsw m3, [coeffq+16*3] 855 856 shufps m0, m0, q1032 857 shufps m1, m1, q1032 858 call m(iadst_4x8_internal).main 859 860 punpckhwd m4, m0, m1 861 punpcklwd m0, m1 862 punpckhwd m1, m2, m3 863 punpcklwd m2, m3 864 pxor m5, m5 865 psubsw m3, m5, m1 866 psubsw m5, m4 867 punpckhdq m4, m5, m3 868 punpckldq m5, m3 869 punpckhdq m3, m0, m2 870 punpckldq m0, m2 871 punpckhwd m1, m0, m5 ;in1 872 punpcklwd m0, m5 ;in0 873 punpcklwd m2, m3, m4 ;in2 874 punpckhwd m3, m4 ;in3 875 jmp tx2q 876 877.pass2: 878 call .main 879 880.end: 881 mova m4, [o(pw_2048)] 882 pmulhrsw m0, m4 883 pmulhrsw m1, m4 884 pmulhrsw m2, m4 885 pmulhrsw m3, m4 
886 887.end2: 888 pxor m6, m6 889 mova [coeffq+16*0], m6 890 mova [coeffq+16*1], m6 891 mova [coeffq+16*2], m6 892 mova [coeffq+16*3], m6 893.end3: 894 WRITE_8X4 0, 1, 2, 3, 4, 5, 6 895 RET 896 897ALIGN function_align 898.main: 899 punpckhwd m6, m0, m2 ;unpacked in0 in2 900 punpcklwd m0, m2 ;unpacked in0 in2 901 punpckhwd m7, m1, m3 ;unpacked in1 in3 902 punpcklwd m1, m3 ;unpacked in1 in3 903 904 mova m2, [o(pw_3344_m3344)] 905 mova m4, [o(pw_0_3344)] 906 pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 907 pmaddwd m5, m4, m7 ;3344 * in3 908 pmaddwd m2, m0 909 pmaddwd m4, m1 910 paddd m3, m5 911 paddd m2, m4 912 mova m4, [o(pd_2048)] 913 paddd m3, m4 ;t2 + 2048 914 paddd m2, m4 915 psrad m3, 12 916 psrad m2, 12 917 packssdw m2, m3 ;out2 918 919 pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 920 pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 921 pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 922 pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 923 paddd m3, m4 ;t0 + t3 924 925 pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 926 mova m4, [o(pd_2048)] 927 paddd m0, m4 928 paddd m4, m3 ;t0 + t3 + 2048 929 paddd m5, m0 ;t1 + t3 + 2048 930 paddd m3, m0 931 paddd m3, m1 ;t0 + t1 - t3 + 2048 932 933 psrad m4, 12 ;out0 934 psrad m5, 12 ;out1 935 psrad m3, 12 ;out3 936 packssdw m0, m4, m5 ;low: out0 high: out1 937 938 pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 939 pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 940 pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 941 pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 942 paddd m1, m4 ;t0 + t3 943 pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 944 945 mova m4, [o(pd_2048)] 946 paddd m6, m4 947 paddd m4, m1 ;t0 + t3 + 2048 948 paddd m5, m6 ;t1 + t3 + 2048 949 paddd m1, m6 950 paddd m1, m7 ;t0 + t1 - t3 + 2048 951 952 psrad m4, 12 ;out0 953 psrad m5, 12 ;out1 954 psrad m1, 12 ;out3 955 packssdw m3, m1 ;out3 956 
packssdw m4, m5 ;low: out0 high: out1 957 958 punpckhqdq m1, m0, m4 ;out1 959 punpcklqdq m0, m4 ;out0 960 ret 961 962INV_TXFM_8X4_FN flipadst, dct 963INV_TXFM_8X4_FN flipadst, adst 964INV_TXFM_8X4_FN flipadst, flipadst 965INV_TXFM_8X4_FN flipadst, identity 966 967cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 968 mova m3, [o(pw_2896x8)] 969 pmulhrsw m0, m3, [coeffq+16*0] 970 pmulhrsw m1, m3, [coeffq+16*1] 971 pmulhrsw m2, m3, [coeffq+16*2] 972 pmulhrsw m3, [coeffq+16*3] 973 974 shufps m0, m0, q1032 975 shufps m1, m1, q1032 976 call m(iadst_4x8_internal).main 977 978 punpckhwd m5, m3, m2 979 punpcklwd m3, m2 980 punpckhwd m2, m1, m0 981 punpcklwd m1, m0 982 983 pxor m0, m0 984 psubsw m4, m0, m2 985 psubsw m0, m5 986 punpckhdq m2, m0, m4 987 punpckldq m0, m4 988 punpckhdq m4, m3, m1 989 punpckldq m3, m1 990 punpckhwd m1, m0, m3 ;in1 991 punpcklwd m0, m3 ;in0 992 punpckhwd m3, m2, m4 ;in3 993 punpcklwd m2, m4 ;in2 994 jmp tx2q 995 996.pass2: 997 call m(iadst_8x4_internal).main 998 mova m4, m0 999 mova m5, m1 1000 mova m0, m3 1001 mova m1, m2 1002 mova m2, m5 1003 mova m3, m4 1004 jmp m(iadst_8x4_internal).end 1005 1006INV_TXFM_8X4_FN identity, dct 1007INV_TXFM_8X4_FN identity, adst 1008INV_TXFM_8X4_FN identity, flipadst 1009INV_TXFM_8X4_FN identity, identity 1010 1011cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 1012 mova m3, [o(pw_2896x8)] 1013 pmulhrsw m0, m3, [coeffq+16*0] 1014 pmulhrsw m1, m3, [coeffq+16*1] 1015 pmulhrsw m2, m3, [coeffq+16*2] 1016 pmulhrsw m3, [coeffq+16*3] 1017 paddsw m0, m0 1018 paddsw m1, m1 1019 paddsw m2, m2 1020 paddsw m3, m3 1021 1022 punpckhwd m4, m0, m1 1023 punpcklwd m0, m1 1024 punpckhwd m1, m2, m3 1025 punpcklwd m2, m3 1026 punpckhdq m5, m4, m1 1027 punpckldq m4, m1 1028 punpckhdq m3, m0, m2 1029 punpckldq m0, m2 1030 punpckhwd m1, m0, m4 ;in1 1031 punpcklwd m0, m4 ;in0 1032 punpcklwd m2, m3, m5 ;in2 1033 punpckhwd m3, m5 ;in3 1034 jmp tx2q 1035 1036.pass2: 1037 mova m7, [o(pw_1697x8)] 
1038 pmulhrsw m4, m7, m0 1039 pmulhrsw m5, m7, m1 1040 pmulhrsw m6, m7, m2 1041 pmulhrsw m7, m3 1042 paddsw m0, m4 1043 paddsw m1, m5 1044 paddsw m2, m6 1045 paddsw m3, m7 1046 jmp m(iadst_8x4_internal).end 1047 1048%macro INV_TXFM_8X8_FN 2 ; type1, type2 1049 INV_TXFM_FN %1, %2, 8x8, 8, 16*4 1050%ifidn %1_%2, dct_dct 1051 pshuflw m0, [coeffq], q0000 1052 punpcklwd m0, m0 1053 mova m1, [o(pw_2896x8)] 1054 pmulhrsw m0, m1 1055 mova m2, [o(pw_16384)] 1056 mov [coeffq], eobd 1057 pmulhrsw m0, m2 1058 psrlw m2, 3 1059 pmulhrsw m0, m1 1060 pmulhrsw m0, m2 1061.end: 1062 mov r3d, 2 1063 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)] 1064.loop: 1065 WRITE_8X4 0, 0, 0, 0, 1, 2, 3 1066 lea dstq, [dstq+strideq*2] 1067 dec r3d 1068 jg .loop 1069 jmp tx2q 1070.end3: 1071 RET 1072%endif 1073%endmacro 1074 1075%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 1076%if %3 1077 mova m7, [o(pw_2896x8)] 1078 pmulhrsw m0, m7, [%1+%2*0] 1079 pmulhrsw m1, m7, [%1+%2*1] 1080 pmulhrsw m2, m7, [%1+%2*2] 1081 pmulhrsw m3, m7, [%1+%2*3] 1082 pmulhrsw m4, m7, [%1+%2*4] 1083 pmulhrsw m5, m7, [%1+%2*5] 1084 pmulhrsw m6, m7, [%1+%2*6] 1085 pmulhrsw m7, [%1+%2*7] 1086%else 1087 mova m0, [%1+%2*0] 1088 mova m1, [%1+%2*1] 1089 mova m2, [%1+%2*2] 1090 mova m3, [%1+%2*3] 1091 mova m4, [%1+%2*4] 1092 mova m5, [%1+%2*5] 1093 mova m6, [%1+%2*6] 1094 mova m7, [%1+%2*7] 1095%endif 1096%endmacro 1097 1098%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 1099 ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a 1100 ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a 1101 psubsw m%2, m%4, m%5 ;t6a 1102 paddsw m%4, m%5 ;t7 1103 psubsw m%5, m%1, m%3 ;t5a 1104 paddsw m%1, m%3 ;t4 1105 ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 1106%endmacro 1107 1108INV_TXFM_8X8_FN dct, dct 1109INV_TXFM_8X8_FN dct, adst 1110INV_TXFM_8X8_FN dct, flipadst 1111INV_TXFM_8X8_FN dct, identity 1112 1113cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 1114 LOAD_8ROWS coeffq, 16 1115 1116.pass1: 
1117 call .main 1118 1119.pass1_end: 1120 mova m7, [o(pw_16384)] 1121 1122.pass1_end1: 1123 REPX {pmulhrsw x, m7}, m0, m2, m4, m6 1124 mova [rsp+gprsize+16*1], m6 1125 1126.pass1_end2: 1127 REPX {pmulhrsw x, m7}, m1, m3, m5 1128 pmulhrsw m7, [rsp+gprsize+16*0] 1129 1130.pass1_end3: 1131 punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 1132 punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 1133 punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 1134 punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 1135 punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 1136 punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 1137 punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 1138 punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 1139 punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 1140 punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 1141 mova [rsp+gprsize+16*2], m6 1142 mova m6, [rsp+gprsize+16*1] 1143 punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 1144 punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 1145 punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 1146 punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 1147 punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 1148 punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 1149 1150 punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 1151 punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 1152 mova [rsp+gprsize+16*0], m2 1153 punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 1154 punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 1155 punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 1156 punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 1157 mova m7, [rsp+gprsize+16*2] 1158 punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 1159 punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 1160 mova m7, [rsp+gprsize+16*0] 1161 jmp tx2q 1162 1163.pass2: 1164 lea tx2q, [o(m(idct_8x8_internal).end4)] 1165 1166.pass2_main: 1167 call .main 1168 1169.end: 1170 mova m7, [o(pw_2048)] 1171 REPX {pmulhrsw x, m7}, m0, m2, m4, m6 1172 mova [rsp+gprsize+16*1], m6 1173 1174.end2: 1175 REPX {pmulhrsw x, m7}, m1, m3, m5 1176 pmulhrsw m7, [rsp+gprsize+16*0] 1177 mova [rsp+gprsize+16*2], m5 1178 mova 
[rsp+gprsize+16*0], m7 1179 1180.end3: 1181 WRITE_8X4 0, 1, 2, 3, 5, 6, 7 1182 lea dstq, [dstq+strideq*2] 1183 WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 1184 jmp tx2q 1185 1186.end4: 1187 pxor m7, m7 1188 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 1189 ret 1190 1191ALIGN function_align 1192.main: 1193 mova [rsp+gprsize*2+16*0], m7 1194 mova [rsp+gprsize*2+16*1], m3 1195 mova [rsp+gprsize*2+16*2], m1 1196 mova m7, [o(pd_2048)] 1197 IDCT4_1D 0, 2, 4, 6, 1, 3, 7 1198 mova m3, [rsp+gprsize*2+16*2] 1199 mova [rsp+gprsize*2+16*2], m2 1200 mova m2, [rsp+gprsize*2+16*1] 1201 mova [rsp+gprsize*2+16*1], m4 1202 mova m4, [rsp+gprsize*2+16*0] 1203 mova [rsp+gprsize*2+16*0], m6 1204 IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 1205 mova m6, [rsp+gprsize*2+16*0] 1206 psubsw m7, m0, m4 ;out7 1207 paddsw m0, m4 ;out0 1208 mova [rsp+gprsize*2+16*0], m7 1209 mova m1, [rsp+gprsize*2+16*2] 1210 psubsw m4, m6, m3 ;out4 1211 paddsw m3, m6 ;out3 1212 mova m7, [rsp+gprsize*2+16*1] 1213 psubsw m6, m1, m5 ;out6 1214 paddsw m1, m5 ;out1 1215 psubsw m5, m7, m2 ;out5 1216 paddsw m2, m7 ;out2 1217 ret 1218 1219 1220INV_TXFM_8X8_FN adst, dct 1221INV_TXFM_8X8_FN adst, adst 1222INV_TXFM_8X8_FN adst, flipadst 1223INV_TXFM_8X8_FN adst, identity 1224 1225cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 1226 LOAD_8ROWS coeffq, 16 1227 1228.pass1: 1229 call .main 1230 call .main_pass1_end 1231 1232.pass1_end: 1233 mova m7, [o(pw_16384)] 1234 1235.pass1_end1: 1236 REPX {pmulhrsw x, m7}, m0, m2, m4, m6 1237 mova [rsp+gprsize+16*1], m6 1238 pxor m6, m6 1239 psubw m6, m7 1240 mova m7, m6 1241 jmp m(idct_8x8_internal).pass1_end2 1242 1243ALIGN function_align 1244.pass2: 1245 lea tx2q, [o(m(idct_8x8_internal).end4)] 1246 1247.pass2_main: 1248 call .main 1249 call .main_pass2_end 1250 1251.end: 1252 mova m7, [o(pw_2048)] 1253 REPX {pmulhrsw x, m7}, m0, m2, m4, m6 1254 mova [rsp+gprsize+16*1], m6 1255 pxor m6, m6 1256 psubw m6, m7 1257 mova m7, m6 1258 
jmp m(idct_8x8_internal).end2

; .main: 1-D stage shared by the adst/flipadst 8x8 kernels, reached with `call`.
; Spills m7, m3 and m4 to three slots at rsp+gprsize*2, uses m7 = pd_2048 as
; the last register argument of ITX_MULSUB_2W, and per the trailing comments
; returns out0 in m0, -out1 in m1, out6 in m6, -out7 in stack slot 0, with
; t7/t3/t2/t6 left in m2/m3/m4/m5 for one of the *_end helpers.
ALIGN function_align
.main:
    mova [rsp+gprsize*2+16*0], m7
    mova [rsp+gprsize*2+16*1], m3
    mova [rsp+gprsize*2+16*2], m4
    mova m7, [o(pd_2048)]
    ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612  ;t3a, t2a
    ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189  ;t7a, t6a
    paddsw m3, m2, m6  ;t2
    psubsw m2, m6  ;t6
    paddsw m4, m5, m1  ;t3
    psubsw m5, m1  ;t7
    ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567  ;t6a, t7a

    ; swap the spilled inputs back in while parking t6a/t7a/t2
    mova m6, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*2], m5
    mova m1, [rsp+gprsize*2+16*1]
    mova [rsp+gprsize*2+16*1], m2
    mova m5, [rsp+gprsize*2+16*0]
    mova [rsp+gprsize*2+16*0], m3
    ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076  ;t1a, t0a
    ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598  ;t5a, t4a
    psubsw m2, m0, m6  ;t4
    paddsw m0, m6  ;t0
    paddsw m3, m5, m1  ;t1
    psubsw m5, m1  ;t5
    ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784  ;t5a, t4a

    mova m7, [rsp+gprsize*2+16*0]
    paddsw m1, m3, m4  ;-out7
    psubsw m3, m4  ;t3
    mova [rsp+gprsize*2+16*0], m1
    psubsw m4, m0, m7  ;t2
    paddsw m0, m7  ;out0
    mova m6, [rsp+gprsize*2+16*2]
    mova m7, [rsp+gprsize*2+16*1]
    paddsw m1, m5, m6  ;-out1
    psubsw m5, m6  ;t6
    paddsw m6, m2, m7  ;out6
    psubsw m2, m7  ;t7
    ret
; .main_pass1_end: finish the four middle outputs with 32-bit arithmetic:
; interleave (t2,t3) and (t6,t7), pmaddwd with pw_2896_2896 / pw_2896_m2896,
; add pd_2048, shift right by 12 and pack back to words.
; m1 and m6 are parked in stack slots 1 and 2 for the duration.
ALIGN function_align
.main_pass1_end:
    mova [rsp+gprsize*2+16*1], m1
    mova [rsp+gprsize*2+16*2], m6
    punpckhwd m1, m4, m3
    punpcklwd m4, m3
    punpckhwd m7, m5, m2
    punpcklwd m5, m2
    mova m2, [o(pw_2896_2896)]
    mova m6, [o(pd_2048)]
    pmaddwd m3, m2, m7
    pmaddwd m2, m5
    paddd m3, m6
    paddd m2, m6
    psrad m3, 12
    psrad m2, 12
    packssdw m2, m3  ;out2
    mova m3, [o(pw_2896_m2896)]
    pmaddwd m7, m3
    pmaddwd m5, m3
    paddd m7, m6
    paddd m5, m6
    psrad m7, 12
    psrad m5, 12
    packssdw m5, m7  ;-out5
    mova m3, [o(pw_2896_2896)]
    pmaddwd m7, m3, m1
    pmaddwd m3, m4
    paddd m7, m6
paddd m3, m6
    psrad m7, 12
    psrad m3, 12
    packssdw m3, m7  ;-out3
    mova m7, [o(pw_2896_m2896)]
    pmaddwd m1, m7
    pmaddwd m4, m7
    paddd m1, m6
    paddd m4, m6
    psrad m1, 12
    psrad m4, 12
    packssdw m4, m1  ;out4
    mova m1, [rsp+gprsize*2+16*1]  ; restore the registers parked on entry
    mova m6, [rsp+gprsize*2+16*2]
    ret
; .main_pass2_end: same four outputs as the pass-1 variant, but computed with
; saturating 16-bit add/sub followed by pmulhrsw against pw_2896x8.
ALIGN function_align
.main_pass2_end:
    paddsw m7, m4, m3  ;t2 + t3
    psubsw m4, m3  ;t2 - t3
    paddsw m3, m5, m2  ;t6 + t7
    psubsw m5, m2  ;t6 - t7
    mova m2, [o(pw_2896x8)]
    pmulhrsw m4, m2  ;out4
    pmulhrsw m5, m2  ;-out5
    pmulhrsw m7, m2  ;-out3
    pmulhrsw m2, m3  ;out2
    mova m3, m7
    ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

; iflipadst_8x8_internal: reuses iadst_8x8_internal's .main and *_end helpers,
; then reverses the register order (m0<->m7, m1<->m6, ...) while applying the
; sign-alternating scale factor, and joins the idct_8x8_internal tail.
cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq, 16

.pass1:
    call m(iadst_8x8_internal).main
    call m(iadst_8x8_internal).main_pass1_end

.pass1_end:
    mova m7, [o(pw_m16384)]

.pass1_end1:
    pmulhrsw m1, m7
    mova [rsp+gprsize+16*1], m1
    mova m1, m6
    mova m6, m2
    pmulhrsw m2, m5, m7
    mova m5, m6
    mova m6, m4
    pmulhrsw m4, m3, m7
    mova m3, m6
    mova m6, m0
    mova m0, m7
    pxor m7, m7
    psubw m7, m0  ; m7 = -m0, the opposite-sign factor
    pmulhrsw m0, [rsp+gprsize+16*0]
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw m7, m6
    jmp m(idct_8x8_internal).pass1_end3

ALIGN function_align
.pass2:
    lea tx2q, [o(m(idct_8x8_internal).end4)]

.pass2_main:
    call m(iadst_8x8_internal).main
    call m(iadst_8x8_internal).main_pass2_end

.end:
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova [rsp+gprsize+16*2], m2
    mova m2, m0
    pxor m0, m0
    psubw m0, m7  ; m0 = -pw_2048
    mova m7, m2
    pmulhrsw m1, m0
    pmulhrsw m2, m5, m0
    mova [rsp+gprsize+16*1], m1
    mova m5, m4
    mova m1, m6
    pmulhrsw m4, m3, m0
    pmulhrsw m0, [rsp+gprsize+16*0]
    mova m3, m5
    mova [rsp+gprsize+16*0], m7
    jmp m(idct_8x8_internal).end3

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; iidentity_8x8_internal: pass 1 only loads the rows, stores m6 to stack
; slot 1 and joins idct_8x8_internal.pass1_end3; pass 2 multiplies every row
; by pw_4096 via pmulhrsw and joins idct_8x8_internal.end3.
cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq, 16
    mova [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal).pass1_end3

ALIGN function_align
.pass2:
    lea tx2q, [o(m(idct_8x8_internal).end4)]

.end:
    pmulhrsw m7, [o(pw_4096)]
    mova [rsp+gprsize+16*0], m7
    mova m7, [o(pw_4096)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    mova [rsp+gprsize+16*2], m5
    mova [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal).end3


; INV_TXFM_4X16_FN type1, type2
; Expands INV_TXFM_FN for the 4x16 size. For dct_dct it also emits a shortcut
; body (presumably the DC-only case - TODO confirm against INV_TXFM_FN): the
; first coefficient is broadcast, scaled by pw_2896x8, pw_16384, pw_2896x8 and
; pw_2048 in turn, and the result is written with four WRITE_4X4 calls.
%macro INV_TXFM_4X16_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x16, 8
%ifidn %1_%2, dct_dct
    pshuflw m0, [coeffq], q0000
    punpcklwd m0, m0
    mova m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    mov [coeffq], eobd
    pmulhrsw m0, [o(pw_16384)]
    pmulhrsw m0, m1
    pmulhrsw m0, [o(pw_2048)]
.end:
    WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
    lea dstq, [dstq+strideq*4]
    WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
    lea dstq, [dstq+strideq*4]
    WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
    lea dstq, [dstq+strideq*4]
    WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
    RET
%endif
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
INV_TXFM_4X16_FN dct, identity

; idct_4x16_internal
; Pass 1 runs the 4x8 pass-1 routine whose address is in r3 twice: first on
; the odd 16-byte rows (1,3,5,7), then on the even rows (0,2,4,6). tx2q is
; borrowed as the return address for each run, so the caller's value is kept
; on the stack with push/pop. The .pass1 label is also entered by the other
; 4x16 kernels with their own r3.
cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea r3, [o(m(idct_4x8_internal).pass1)]

.pass1:
    mova m0, [coeffq+16*1]
    mova m1, [coeffq+16*3]
    mova m2, [coeffq+16*5]
    mova m3, [coeffq+16*7]
    push tx2q
    lea tx2q, [o(m(idct_4x16_internal).pass1_2)]
    jmp r3

.pass1_2:
    mova [coeffq+16*1], m0
    mova [coeffq+16*3], m1
    mova [coeffq+16*5], m2
    mova [coeffq+16*7], m3
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*2]
    mova m2, [coeffq+16*4]
    mova m3, [coeffq+16*6]
    lea tx2q, [o(m(idct_4x16_internal).pass1_end)]
    jmp r3

.pass1_end:
    pop tx2q

    mova m4, [coeffq+16*1]
    mova m5, [coeffq+16*3]
    mova m6, [coeffq+16*5]
    mova m7, [o(pw_16384)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6

    pmulhrsw m7, [coeffq+16*7]  ; eighth row is scaled in place in memory
    mova [coeffq+16*7], m7
    jmp tx2q

.pass2:
    call m(idct_16x4_internal).main

.end:
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [coeffq+16*7]
    mova [coeffq+16*4], m4

.end1:
    mova [coeffq+16*5], m5
    mova [coeffq+16*6], m6
    mov r3, coeffq  ; r3 is used below for the reloads and the final clear
    WRITE_4X8 0, 1, 3, 2

    mova m0, [r3+16*4]
    mova m1, [r3+16*5]
    mova m2, [r3+16*6]
    mova m3, m7
    lea dstq, [dstq+strideq*4]
    WRITE_4X8 0, 1, 3, 2

.end2:
    pxor m7, m7
    REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

; iadst_4x16_internal: pass 1 reuses idct_4x16_internal.pass1 with the
; iadst 4x8 routine in r3; pass 2 calls the 16x4 adst core and then regroups
; the packed halves (signs as noted per line) before scaling.
cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea r3, [o(m(iadst_4x8_internal).pass1)]
    jmp m(idct_4x16_internal).pass1

.pass2:
    call m(iadst_16x4_internal).main
    call m(iadst_16x4_internal).main_pass2_end

    punpcklqdq m6, m5, m4  ;low: -out5 high: -out7
    punpckhqdq m4, m5  ;low: out8 high: out10
    punpcklqdq m5, m7, m2  ;low: out4 high: out6
    punpckhqdq m2, m7  ;low: -out9 high: -out11
    mova [coeffq+16*4], m2
    mova [coeffq+16*5], m6
    mova m2, [coeffq+16*6]
    mova m6, [coeffq+16*7]
    punpckhqdq m1, m6, m0  ;low: -out13 high: -out15
    punpcklqdq m0, m6  ;low: out0 high: out2
    punpckhqdq m6, m3, m2  ;low: out12 high: out14
    punpcklqdq m2, m3  ;low: -out1 high: -out3

mova m7, [o(pw_2048)]

; .end1: entered with the scale factor in m7 (pw_2048 above; another kernel
; jumps here with pw_m2048). Registers holding positive outputs are multiplied
; by m7, the others by its negation, then low/high halves are regrouped into
; consecutive output pairs.
.end1:
    REPX {pmulhrsw x, m7}, m0, m5, m4, m6
    pxor m3, m3
    psubw m3, m7  ; m3 = -m7
    mova m7, [coeffq+16*4]
    REPX {pmulhrsw x, m3}, m2, m7, m1
    pmulhrsw m3, [coeffq+16*5]
    mova [coeffq+16*7], m5

    punpckhqdq m5, m4, m7  ;low: out10 high: out11
    punpcklqdq m4, m7  ;low: out8 high: out9
    punpckhqdq m7, m6, m1  ;low: out14 high: out15
    punpcklqdq m6, m1  ;low: out12 high: out13
    punpckhqdq m1, m0, m2  ;low: out2 high: out3
    punpcklqdq m0, m2  ;low: out0 high: out1
    mova [coeffq+16*4], m4
    mova m4, [coeffq+16*7]
    punpcklqdq m2, m4, m3  ;low: out4 high: out5
    punpckhqdq m4, m3  ;low: out6 high: out7
    mova m3, m4

; .end2: write m0-m3, then the rows parked at [coeffq+16*4..6] plus m7.
.end2:
    mova [coeffq+16*5], m5
    mova [coeffq+16*6], m6
    mov r3, coeffq
    WRITE_4X8 0, 1, 2, 3

    mova m0, [r3+16*4]
    mova m1, [r3+16*5]
    mova m2, [r3+16*6]
    mova m3, m7
    lea dstq, [dstq+strideq*4]
    WRITE_4X8 0, 1, 2, 3

; .end3: clear the eight 16-byte rows at r3 and return.
.end3:
    pxor m7, m7
    REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    ret


INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

; iflipadst_4x16_internal: same cores as iadst_4x16_internal; pass 2 uses the
; opposite low/high unpacks and enters iadst_4x16_internal.end1 with pw_m2048,
; so the sign pattern is inverted along with the ordering.
cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea r3, [o(m(iflipadst_4x8_internal).pass1)]
    jmp m(idct_4x16_internal).pass1

.pass2:
    call m(iadst_16x4_internal).main
    call m(iadst_16x4_internal).main_pass2_end

    punpckhqdq m6, m5, m4  ;low: out5 high: out7
    punpcklqdq m4, m5  ;low: -out8 high: -out10
    punpckhqdq m5, m7, m2  ;low: -out4 high: -out6
    punpcklqdq m2, m7  ;low: out9 high: out11
    mova [coeffq+16*4], m2
    mova [coeffq+16*5], m6
    mova m2, [coeffq+16*6]
    mova m6, [coeffq+16*7]
    punpcklqdq m1, m6, m0  ;low: out13 high: out15
    punpckhqdq m0, m6  ;low: -out0 high: -out2
    punpcklqdq m6, m3, m2  ;low: -out12 high: -out14
    punpckhqdq m2, m3  ;low: out1 high: out3

    mova m7, [o(pw_m2048)]
    jmp m(iadst_4x16_internal).end1


INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

; IDTX16 src/dst, tmp, coef[, shift]
; tmp = pmulhrsw(coef, src). With three arguments: src = 2*src + tmp
; (saturating). With the optional fourth register: tmp is additionally
; multiplied by it and src = src + tmp.
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
    pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if downshifting by 1
    pmulhrsw m%2, m%4
%else
    paddsw m%1, m%1
%endif
    paddsw m%1, m%2
%endmacro

; iidentity_4x16_internal
; Pass 1 runs the .pass1 body twice (odd rows, then even rows), keeping the
; caller's tx2q in r3 and using tx2q as the continuation for
; iadst_4x8_internal.pass1_end. Each row becomes pavgw(pmulhrsw(pw_1697x8, x), x);
; words equal to -1 (all ones, compared against m7) are forced to zero by the
; pcmpeqw/pandn pair.
; Pass 2 applies IDTX16 to all eight rows, scales by pw_2048 and joins
; iadst_4x16_internal.end2.
cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m0, [coeffq+16*1]
    mova m6, [o(pw_1697x8)]
    mova m1, [coeffq+16*3]
    mova m2, [coeffq+16*5]
    mova m3, [coeffq+16*7]
    pcmpeqw m7, m7  ; m7 = all ones
    mov r3, tx2q
    lea tx2q, [o(.pass1_2)]
.pass1:
    pmulhrsw m4, m6, m0
    pmulhrsw m5, m6, m1
    pavgw m4, m0
    pcmpeqw m0, m7
    pavgw m5, m1
    pcmpeqw m1, m7
    pandn m0, m4
    pmulhrsw m4, m6, m2
    pandn m1, m5
    pmulhrsw m5, m6, m3
    pavgw m4, m2
    pcmpeqw m2, m7
    pavgw m5, m3
    pcmpeqw m3, m7
    pandn m2, m4
    pandn m3, m5
    jmp m(iadst_4x8_internal).pass1_end
.pass1_2:
    mova [coeffq+16*1], m0
    mova [coeffq+16*3], m1
    mova [coeffq+16*5], m2
    mova [coeffq+16*7], m3
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*2]
    mova m2, [coeffq+16*4]
    mova m3, [coeffq+16*6]
    lea tx2q, [o(.pass1_end)]
    jmp .pass1
.pass1_end:
    mova m4, [coeffq+16*1]
    mova m5, [coeffq+16*3]
    mova m6, [coeffq+16*5]
    jmp r3  ; r3 holds the tx2q value saved at entry
.pass2:
    mova m7, [o(pw_1697x16)]
    mova [coeffq+16*6], m6  ; m6 doubles as IDTX16 scratch, so park its row
    REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    mova m6, [coeffq+16*7]
    IDTX16 6, 7, 7
    mova [coeffq+16*7], m6
    mova m6, [coeffq+16*6]
    pmulhrsw m7, m6, [o(pw_1697x16)]  ; open-coded IDTX16 for the parked row
    paddsw m6, m6
    paddsw m6, m7
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [coeffq+16*7]
    mova [coeffq+16*4], m4
    jmp m(iadst_4x16_internal).end2


; INV_TXFM_16X4_FN type1, type2
; Expands INV_TXFM_FN for the 16x4 size. For dct_dct it also emits a shortcut
; body (presumably the DC-only case - TODO confirm against INV_TXFM_FN).
; .dconly is also a jump target for another size, which supplies its own
; m0/m1/m2, row-pair count in r2d and continuation in tx2q.
; .dconly_loop adds the broadcast value in m0 to two 16-pixel rows per
; iteration, with unsigned saturation on the pack.
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x4, 8
%ifidn %1_%2, dct_dct
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_16384)]
    mov [coeffq], eobd
    mov r2d, 2
    lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
.dconly:
    pmulhrsw m0, m2
    movd m2, [o(pw_2048)]  ;intentionally rip-relative
    pmulhrsw m0, m1
    pmulhrsw m0, m2
    pshuflw m0, m0, q0000
    punpcklwd m0, m0
    pxor m5, m5  ; zero, used to widen bytes to words
.dconly_loop:
    mova m1, [dstq]
    mova m3, [dstq+strideq]
    punpckhbw m2, m1, m5
    punpcklbw m1, m5
    punpckhbw m4, m3, m5
    punpcklbw m3, m5
    paddw m2, m0
    paddw m1, m0
    paddw m4, m0
    paddw m3, m0
    packuswb m1, m2
    packuswb m3, m4
    mova [dstq], m1
    mova [dstq+strideq], m3
    lea dstq, [dstq+strideq*2]
    dec r2d
    jg .dconly_loop
    jmp tx2q
.end:
    RET
%endif
%endmacro

; LOAD_7ROWS src, stride: load m0-m6 from src + stride*0..6.
%macro LOAD_7ROWS 2 ;src, stride
    mova m0, [%1+%2*0]
    mova m1, [%1+%2*1]
    mova m2, [%1+%2*2]
    mova m3, [%1+%2*3]
    mova m4, [%1+%2*4]
    mova m5, [%1+%2*5]
    mova m6, [%1+%2*6]
%endmacro

; SAVE_7ROWS dst, stride: store m0-m6 to dst + stride*0..6.
%macro SAVE_7ROWS 2 ;dst, stride
    mova [%1+%2*0], m0
    mova [%1+%2*1], m1
    mova [%1+%2*2], m2
    mova [%1+%2*3], m3
    mova [%1+%2*4], m4
    mova [%1+%2*5], m5
    mova [%1+%2*6], m6
%endmacro

; IDCT16_1D_PACKED_ODDHALF src1-4, tmp1-3
; Works on registers that each hold two values (low/high qword halves), as
; the per-line comments describe. tmp3 is loaded with pd_2048.
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
    punpckhwd m%5, m%4, m%1  ;packed in13 in3
    punpcklwd m%1, m%4  ;packed in1 in15
    punpcklwd m%4, m%3, m%2  ;packed in9 in7
    punpckhwd m%2, m%3  ;packed in5 in11
    mova m%7, [o(pd_2048)]
    ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1  ;low: t8a high: t15a
    ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1  ;low: t9a high: t14a
    ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1  ;low: t10a high: t13a
    ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1  ;low: t11a high: t12a
    psubsw m%6, m%1, m%4  ;low: t9 high: t14
paddsw m%1, m%4  ;low: t8 high: t15
    psubsw m%4, m%5, m%2  ;low: t10 high: t13
    paddsw m%5, m%2  ;low: t11 high: t12
    mova m%2, [o(deint_shuf2)]
    pshufb m%6, m%2
    pshufb m%4, m%2
    ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1  ;low: t9a high: t14a
    ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1  ;low: t10a high: t13a
    psubsw m%3, m%1, m%5  ;low: t11a high: t12a
    paddsw m%1, m%5  ;low: t8a high: t15a
    psubsw m%5, m%6, m%4  ;low: t10 high: t13
    paddsw m%6, m%4  ;low: t9 high: t14
    pshufb m%3, m%2
    pshufb m%5, m%2
    ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4  ;t12, t11
    ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4  ;t13a, t10a
    packssdw m%2, m%4  ;low: t11 high: t10a
    packssdw m%3, m%5  ;low: t12 high: t13a
    punpckhqdq m%4, m%1, m%6  ;low: t15a high: t14
    punpcklqdq m%1, m%6  ;low: t8a high: t9
%endmacro

INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

; idct_16x4_internal
; Pass 1: load seven rows (the eighth is read from [coeffq+16*7] inside the
; helpers), call .main, then interleave, scale by pw_16384 and finish the
; word interleave in .pass1_end3 before jumping to tx2q.
; [coeffq+16*6] and [coeffq+16*7] serve as a ninth/tenth register.
; .pass1_end3 is also a jump target for the other 16x4 kernels.
cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_7ROWS coeffq, 16
    call .main

.pass1_end:
    punpckhwd m7, m0, m2  ;packed out1, out5
    punpcklwd m0, m2  ;packed out0, out4
    punpcklwd m2, m1, m3  ;packed out3, out7
    punpckhwd m1, m3  ;packed out2, out6
    mova [coeffq+16*6], m7
    mova m7, [coeffq+16*7]
    punpckhwd m3, m4, m6  ;packed out9, out13
    punpcklwd m4, m6  ;packed out8, out12
    punpcklwd m6, m5, m7  ;packed out11, out15
    punpckhwd m5, m7  ;packed out10, out14

.pass1_end2:
    mova m7, [o(pw_16384)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [coeffq+16*6]
    mova [coeffq+16*6], m7

.pass1_end3:
    punpckhwd m7, m3, m6  ;packed 9, 11, 13, 15 high
    punpcklwd m3, m6  ;packed 9, 11, 13, 15 low
    punpckhwd m6, m4, m5  ;packed 8, 10, 12, 14 high
    punpcklwd m4, m5  ;packed 8, 10, 12, 14 low
    punpckhwd m5, m4, m3  ;8, 9, 10, 11, 12, 13, 14, 15(1)
punpcklwd m4, m3  ;8, 9, 10, 11, 12, 13, 14, 15(0)
    punpckhwd m3, m6, m7  ;8, 9, 10, 11, 12, 13, 14, 15(3)
    punpcklwd m6, m7  ;8, 9, 10, 11, 12, 13, 14, 15(2)
    mova [coeffq+16*7], m3
    mova m3, [coeffq+16*6]
    punpckhwd m7, m3, m2  ;packed 1, 3, 5, 7 high
    punpcklwd m3, m2  ;packed 1, 3, 5, 7 low
    punpckhwd m2, m0, m1  ;packed 0, 2, 4, 6 high
    punpcklwd m0, m1  ;packed 0, 2, 4, 6 low
    punpckhwd m1, m0, m3  ;0, 1, 2, 3, 4, 5, 6, 7(1)
    punpcklwd m0, m3  ;0, 1, 2, 3, 4, 5, 6, 7(0)
    punpckhwd m3, m2, m7  ;0, 1, 2, 3, 4, 5, 6, 7(3)
    punpcklwd m2, m7  ;0, 1, 2, 3, 4, 5, 6, 7(2)
    jmp tx2q

.pass2:
    lea tx2q, [o(m(idct_8x4_internal).pass2)]

; .pass2_end: shared by all 16x4 kernels. Runs the 8x4 pass-2 routine in tx2q
; twice: once via `call` on m0-m3 at dstq, then via `jmp` on the rows parked
; at [coeffq+16*4..7] with dstq advanced by 8 (held in r3 meanwhile).
.pass2_end:
    mova [coeffq+16*4], m4
    mova [coeffq+16*5], m5
    mova [coeffq+16*6], m6
    lea r3, [dstq+8]
    call tx2q

    add coeffq, 16*4
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*1]
    mova m2, [coeffq+16*2]
    mova m3, [coeffq+16*3]
    mov dstq, r3
    jmp tx2q

; .main: packed 16-point stage. Splits the loaded rows into qword halves,
; runs IDCT8_1D_PACKED on one set and IDCT16_1D_PACKED_ODDHALF on the other
; (parked at [coeffq+16*4..6] in between), then combines them with saturating
; add/sub. One result register is left at [coeffq+16*7].
ALIGN function_align
.main:
    punpckhqdq m7, m0, m1  ;low:in1 high:in3
    punpcklqdq m0, m1
    punpcklqdq m1, m2, m3
    punpckhqdq m3, m2  ;low:in7 high:in5
    mova [coeffq+16*4], m7
    mova [coeffq+16*5], m3
    mova m7, [coeffq+16*7]
    punpcklqdq m2, m4, m5
    punpckhqdq m4, m5  ;low:in9 high:in11
    punpcklqdq m3, m6, m7
    punpckhqdq m7, m6  ;low:in15 high:in13
    mova [coeffq+16*6], m4
    IDCT8_1D_PACKED
    mova m6, [coeffq+16*4]
    mova m4, [coeffq+16*5]
    mova m5, [coeffq+16*6]
    mova [coeffq+16*4], m1
    mova [coeffq+16*5], m2
    mova [coeffq+16*6], m3

    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3

    mova m1, [coeffq+16*4]
    psubsw m3, m0, m7  ;low:out15 high:out14
    paddsw m0, m7  ;low:out0 high:out1
    psubsw m7, m1, m5  ;low:out12 high:out13
    paddsw m1, m5  ;low:out3 high:out2
    mova [coeffq+16*7], m3
    mova m2, [coeffq+16*5]
    mova m3, [coeffq+16*6]
    psubsw m5, m2, m4  ;low:out11 high:out10
    paddsw m2, m4  ;low:out4 high:out5
    psubsw m4, m3, m6  ;low:out8 high:out9
    paddsw m3, m6  ;low:out7 high:out6
    mova m6, m7
    ret

INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; iadst_16x4_internal
; Pass 1: .main + .main_pass1_end, then interleave the packed results. At
; .pass1_end the registers commented as positive are scaled by m7 and the
; negated ones by -m7, before joining idct_16x4_internal.pass1_end3.
; .pass1_end is also entered by the flipadst kernel with a different m7.
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_7ROWS coeffq, 16
    call .main
    call .main_pass1_end

    punpckhwd m6, m7, m0  ;packed -out11, -out15
    punpcklwd m0, m7  ;packed out0, out4
    punpcklwd m7, m3, m4  ;packed -out3, -out7
    punpckhwd m4, m3  ;packed out8, out12
    mova m1, [coeffq+16*6]
    punpcklwd m3, m1, m5  ;packed -out1, -out5
    punpckhwd m5, m1  ;packed out10, out14
    mova m1, [coeffq+16*7]
    mova [coeffq+16*6], m3
    mova [coeffq+16*7], m7
    punpckhwd m3, m2, m1  ;packed -out9, -out13
    punpcklwd m1, m2  ;packed out2, out6

    mova m7, [o(pw_16384)]

.pass1_end:
    REPX {pmulhrsw x, m7}, m0, m1, m4, m5
    pxor m2, m2
    psubw m2, m7  ; m2 = -m7
    mova m7, [coeffq+16*6]
    REPX {pmulhrsw x, m2}, m7, m3, m6
    pmulhrsw m2, [coeffq+16*7]
    mova [coeffq+16*6], m7
    jmp m(idct_16x4_internal).pass1_end3

.pass2:
    lea tx2q, [o(m(iadst_8x4_internal).pass2)]
    jmp m(idct_16x4_internal).pass2_end

; .main: packed 16-point stage for the adst kernels. m0-m6 hold the loaded
; rows and the eighth comes from [coeffq+16*7]; m6 is reloaded with pd_2048
; and used as the third argument of every ITX_MUL2X_PACK.
ALIGN function_align
.main:
    mova [coeffq+16*6], m0
    pshufd m0, m1, q1032
    pshufd m2, m2, q1032
    punpckhwd m1, m6, m0  ;packed in13, in2
    punpcklwd m0, m6  ;packed in3, in12
    punpckhwd m7, m5, m2  ;packed in11, in4
    punpcklwd m2, m5  ;packed in5, in10
    mova m6, [o(pd_2048)]
    ITX_MUL2X_PACK 1, 5, 6, 995, 3973  ;low:t2 high:t3
    ITX_MUL2X_PACK 7, 5, 6, 1751, 3703  ;low:t4 high:t5
    ITX_MUL2X_PACK 2, 5, 6, 3513, 2106  ;low:t10 high:t11
    ITX_MUL2X_PACK 0, 5, 6, 3857, 1380  ;low:t12 high:t13
    psubsw m5, m1, m2  ;low:t10a high:t11a
    paddsw m1, m2  ;low:t2a high:t3a
    psubsw m2, m7, m0  ;low:t12a high:t13a
paddsw m7, m0  ;low:t4a high:t5a
    ; Recurring idiom below: punpcklqdq + punpckhwd on the same pair
    ; interleaves the words of the two high qwords, giving the
    ; "packed tX, tY" layout named in the comments.
    punpcklqdq m0, m5
    punpckhwd m0, m5  ;packed t10a, t11a
    punpcklqdq m5, m2
    punpckhwd m2, m5  ;packed t13a, t12a
    ITX_MUL2X_PACK 0, 5, 6, 3406, 2276  ;low:t10 high:t11
    ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1  ;low:t12 high:t13
    mova [coeffq+16*4], m1  ; park t2a/t3a and t4a/t5a while the other inputs are processed
    mova [coeffq+16*5], m7
    mova m1, [coeffq+16*6]
    mova m7, [coeffq+16*7]
    pshufd m1, m1, q1032
    pshufd m3, m3, q1032
    punpckhwd m5, m7, m1  ;packed in15, in0
    punpcklwd m1, m7  ;packed in1, in14
    punpckhwd m7, m4, m3  ;packed in9, in6
    punpcklwd m3, m4  ;packed in7, in8
    ITX_MUL2X_PACK 5, 4, 6, 201, 4091  ;low:t0 high:t1
    ITX_MUL2X_PACK 7, 4, 6, 2440, 3290  ;low:t6 high:t7
    ITX_MUL2X_PACK 3, 4, 6, 3035, 2751  ;low:t8 high:t9
    ITX_MUL2X_PACK 1, 4, 6, 4052, 601  ;low:t14 high:t15
    psubsw m4, m5, m3  ;low:t8a high:t9a
    paddsw m5, m3  ;low:t0a high:t1a
    psubsw m3, m7, m1  ;low:t14a high:t15a
    paddsw m7, m1  ;low:t6a high:t7a
    punpcklqdq m1, m4
    punpckhwd m1, m4  ;packed t8a, t9a
    punpcklqdq m4, m3
    punpckhwd m3, m4  ;packed t15a, t14a
    ITX_MUL2X_PACK 1, 4, 6, 799, 4017  ;low:t8 high:t9
    ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1  ;low:t14 high:t15
    paddsw m4, m1, m2  ;low:t12a high:t13a
    psubsw m1, m2  ;low:t8a high:t9a
    psubsw m2, m0, m3  ;low:t14a high:t15a
    paddsw m0, m3  ;low:t10a high:t11a
    punpcklqdq m3, m1
    punpckhwd m3, m1  ;packed t12a, t13a
    punpcklqdq m1, m2
    punpckhwd m2, m1  ;packed t15a, t14a
    ITX_MUL2X_PACK 3, 1, 6, 1567, 3784  ;low:t12 high:t13
    ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1  ;low:t14 high:t15
    psubsw m1, m3, m2  ;low:t14a high:t15a
    paddsw m3, m2  ;low:out2 high:-out13
    psubsw m2, m4, m0  ;low:t10 high:t11
    paddsw m0, m4  ;low:-out1 high:out14
    mova [coeffq+16*6], m0  ; finished outputs are returned through coeff slots 6 and 7
    mova [coeffq+16*7], m3
    mova m0, [coeffq+16*4]
    mova m3, [coeffq+16*5]
    psubsw m4, m5, m3  ;low:t4 high:t5
    paddsw m5, m3  ;low:t0 high:t1
psubsw m3, m0, m7  ;low:t6 high:t7
    paddsw m0, m7  ;low:t2 high:t3
    punpcklqdq m7, m4
    punpckhwd m7, m4  ;packed t4, t5
    punpcklqdq m4, m3
    punpckhwd m3, m4  ;packed t7, t6
    ITX_MUL2X_PACK 7, 4, 6, 1567, 3784  ;low:t4a high:t5a
    ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1  ;low:t6a high:t7a
    psubsw m4, m5, m0  ;low:t2a high:t3a
    paddsw m0, m5  ;low:out0 high:-out15
    psubsw m5, m7, m3  ;low:t6 high:t7
    paddsw m3, m7  ;low:-out3 high:out12
    ret
; .main_pass1_end: finish the middle outputs with 32-bit arithmetic.
; Each of m1/m2/m4/m5 is shuffled with deint_shuf1, multiplied (pmaddwd) by
; pw_2896_2896 and pw_2896_m2896, rounded with m6 and shifted right by 12.
; m6 is not loaded here; it is used as the rounding addend, so it presumably
; still holds pd_2048 from .main - TODO confirm ITX_MUL2X_PACK preserves it.
; m0 and m3 are parked at [coeffq+16*4..5] and restored before returning.
ALIGN function_align
.main_pass1_end:
    mova m7, [o(deint_shuf1)]
    mova [coeffq+16*4], m0
    mova [coeffq+16*5], m3
    mova m0, [o(pw_2896_m2896)]
    mova m3, [o(pw_2896_2896)]
    pshufb m1, m7  ;t14a t15a
    pshufb m2, m7  ;t10 t11
    pshufb m4, m7  ;t2a t3a
    pshufb m5, m7  ;t6 t7
    pmaddwd m7, m0, m2
    pmaddwd m2, m3
    paddd m7, m6
    paddd m2, m6
    psrad m7, 12
    psrad m2, 12
    packssdw m2, m7  ;low:out6 high:-out9
    pmaddwd m7, m0, m4
    pmaddwd m4, m3
    paddd m7, m6
    paddd m4, m6
    psrad m7, 12
    psrad m4, 12
    packssdw m4, m7  ;low:-out7 high:out8
    pmaddwd m7, m3, m5
    pmaddwd m5, m0
    paddd m7, m6
    paddd m5, m6
    psrad m7, 12
    psrad m5, 12
    packssdw m7, m5  ;low:out4 high:-out11
    pmaddwd m5, m3, m1
    pmaddwd m1, m0
    paddd m5, m6
    paddd m1, m6
    psrad m5, 12
    psrad m1, 12
    packssdw m5, m1  ;low:-out5 high:out10
    mova m0, [coeffq+16*4]
    mova m3, [coeffq+16*5]
    ret
; .main_pass2_end: 16-bit variant of the step above: saturating add/sub of
; the regrouped halves, then pmulhrsw against pw_2896x8.
ALIGN function_align
.main_pass2_end:
    mova m7, [o(pw_2896x8)]
    punpckhqdq m6, m2, m1  ;low:t11 high:t15a
    punpcklqdq m2, m1  ;low:t10 high:t14a
    psubsw m1, m2, m6
    paddsw m2, m6
    punpckhqdq m6, m4, m5  ;low:t3a high:t7
    punpcklqdq m4, m5  ;low:t2a high:t6
    psubsw m5, m4, m6
    paddsw m4, m6
    pmulhrsw m1, m7  ;low:-out9 high:out10
    pmulhrsw m2, m7  ;low:out6 high:-out5
    pmulhrsw m5, m7  ;low:out8 high:-out11
    pmulhrsw m4, m7  ;low:-out7 high:out4
punpckhqdq m7, m4, m5  ;low:out4 high:-out11
    punpcklqdq m4, m5  ;low:-out7 high:out8
    punpckhqdq m5, m2, m1  ;low:-out5 high:out10
    punpcklqdq m2, m1  ;low:out6 high:-out9
    ret


INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; iflipadst_16x4_internal: same cores as iadst_16x4_internal, with the
; opposite low/high interleaves and pw_m16384 as the scale factor passed to
; iadst_16x4_internal.pass1_end, which inverts the sign pattern.
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_7ROWS coeffq, 16
    call m(iadst_16x4_internal).main
    call m(iadst_16x4_internal).main_pass1_end

    punpcklwd m6, m7, m0  ;packed out11, out15
    punpckhwd m0, m7  ;packed -out0, -out4
    punpckhwd m7, m3, m4  ;packed out3, out7
    punpcklwd m4, m3  ;packed -out8, -out12
    mova m1, [coeffq+16*6]
    punpckhwd m3, m1, m5  ;packed out1, out5
    punpcklwd m5, m1  ;packed -out10, -out14
    mova m1, [coeffq+16*7]
    mova [coeffq+16*6], m3
    mova [coeffq+16*7], m7
    punpcklwd m3, m2, m1  ;packed out9, out13
    punpckhwd m1, m2  ;packed -out2, -out6

    mova m7, [o(pw_m16384)]
    jmp m(iadst_16x4_internal).pass1_end

.pass2:
    lea tx2q, [o(m(iflipadst_8x4_internal).pass2)]
    jmp m(idct_16x4_internal).pass2_end


INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

; iidentity_16x4_internal
; Pass 1 updates each of the eight rows as
;   x += pmulhrsw(pmulhrsw(pw_1697x16, x), pw_16384)
; working in three batches because only eight registers are available
; (rows 5-7 are written back to coeffq in between), then interleaves and
; joins idct_16x4_internal.pass1_end3.
cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m1, [coeffq+16*6]
    mova m0, [coeffq+16*5]
    mova m2, [coeffq+16*7]
    mova m6, [o(pw_1697x16)]
    mova m7, [o(pw_16384)]
    pmulhrsw m4, m6, m1
    pmulhrsw m3, m6, m0
    pmulhrsw m5, m6, m2
    pmulhrsw m4, m7
    pmulhrsw m3, m7
    pmulhrsw m5, m7
    paddsw m1, m4
    paddsw m0, m3
    paddsw m5, m2
    mova m2, [coeffq+16*2]
    mova m3, [coeffq+16*3]
    mova m4, [coeffq+16*4]
    mova [coeffq+16*6], m1
    mova [coeffq+16*5], m0
    mova [coeffq+16*7], m5
    pmulhrsw m0, m6, m2
    pmulhrsw m1, m6, m3
    pmulhrsw m5, m6, m4
    pmulhrsw m0, m7
    pmulhrsw m1, m7
    pmulhrsw m5, m7
    paddsw m2, m0
    paddsw m3, m1
    paddsw m4, m5
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*1]
    pmulhrsw m5, m6, m0
    pmulhrsw m6, m1  ; last use of the coefficient register, so it is overwritten
    pmulhrsw m5, m7
    pmulhrsw m6, m7
    paddsw m0, m5
    paddsw m1, m6
    mova m6, [coeffq+16*6]
    mova m5, [coeffq+16*5]
    punpckhwd m7, m0, m2  ;packed out1, out5
    punpcklwd m0, m2  ;packed out0, out4
    punpckhwd m2, m1, m3  ;packed out3, out7
    punpcklwd m1, m3  ;packed out2, out6
    mova [coeffq+16*6], m7
    mova m7, [coeffq+16*7]
    punpckhwd m3, m4, m6  ;packed out9, out13
    punpcklwd m4, m6  ;packed out8, out12
    punpckhwd m6, m5, m7  ;packed out11, out15
    punpcklwd m5, m7  ;packed out10, out14
    jmp m(idct_16x4_internal).pass1_end3

.pass2:
    lea tx2q, [o(m(iidentity_8x4_internal).pass2)]
    jmp m(idct_16x4_internal).pass2_end


; SAVE_8ROWS dst, stride: store m0-m7 to dst + stride*0..7.
%macro SAVE_8ROWS 2 ;dst, stride
    mova [%1+%2*0], m0
    mova [%1+%2*1], m1
    mova [%1+%2*2], m2
    mova [%1+%2*3], m3
    mova [%1+%2*4], m4
    mova [%1+%2*5], m5
    mova [%1+%2*6], m6
    mova [%1+%2*7], m7
%endmacro

; INV_TXFM_8X16_FN type1, type2
; Expands INV_TXFM_FN for the 8x16 size (with a 16*16 extra argument). For
; dct_dct it also emits a shortcut body (presumably the DC-only case - TODO
; confirm) that scales the broadcast first coefficient and then reuses the
; 8x8 dct_dct .loop with r3d = 4 and this function's .end as continuation.
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x16, 8, 16*16
%ifidn %1_%2, dct_dct
    pshuflw m0, [coeffq], q0000
    punpcklwd m0, m0
    mova m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    mova m2, [o(pw_16384)]
    mov [coeffq], eobd
    pmulhrsw m0, m1
    pmulhrsw m0, m2
    psrlw m2, 3  ; pw_2048
    pmulhrsw m0, m1
    pmulhrsw m0, m2
    mov r3d, 4
    lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)]
    jmp m(inv_txfm_add_dct_dct_8x8).loop
.end:
    RET
%endif
%endmacro

INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
INV_TXFM_8X16_FN dct, identity

; idct_8x16_internal: r3 selects the 8x8 pass-1 routine that the .pass1 code
; following this header jumps to.
cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea r3, [o(m(idct_8x8_internal).pass1)]

; .pass1: shared 8x16 first pass. Runs the 8x8 pass-1 routine in r3 twice:
; first on the rows at coeffq+16*1 (stride 32), then on those at coeffq+16*0.
; tx2q is borrowed as the continuation for the first run, so the caller's
; value is parked at [rsp+gprsize+16*11] and restored for the second.
.pass1:
    LOAD_8ROWS coeffq+16*1, 32, 1
    mov [rsp+gprsize+16*11], tx2q
    lea tx2q, [o(m(idct_8x16_internal).pass1_end)]
    jmp r3

.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS coeffq+16*0, 32, 1
    mov tx2q, [rsp+gprsize+16*11]
    jmp r3

.pass2:
    lea tx2q, [o(m(idct_8x16_internal).end)]

; .pass2_pre: keep m0/m2/m4/m6 (moved down to m0-m3), write m1/m3/m5/m7 back
; to coeff rows 2/6/10/14 and load rows 1/5/9/13 into m4-m7.
.pass2_pre:
    mova [coeffq+16*2 ], m1
    mova [coeffq+16*6 ], m3
    mova [coeffq+16*10], m5
    mova [coeffq+16*14], m7
    mova m1, m2
    mova m2, m4
    mova m3, m6
    mova m4, [coeffq+16*1 ]
    mova m5, [coeffq+16*5 ]
    mova m6, [coeffq+16*9 ]
    mova m7, [coeffq+16*13]

; .pass2_main: 8-point stage first, results parked in stack slots 3-9, then
; the idct_16x8_internal.main stage on the remaining rows. Output is written
; in two halves through idct_8x8_internal.end: the half at dstq+stride*8
; first, then (from .end) the parked half at the original dstq kept in r3.
.pass2_main:
    call m(idct_8x8_internal).main

    SAVE_7ROWS rsp+gprsize+16*3, 16
    mova m0, [coeffq+16*2 ]
    mova m1, [coeffq+16*6 ]
    mova m2, [coeffq+16*10]
    mova m3, [coeffq+16*14]
    mova m4, [coeffq+16*3 ]
    mova m5, [coeffq+16*7 ]
    mova m6, [coeffq+16*11]
    mova m7, [coeffq+16*15]
    call m(idct_16x8_internal).main

    mov r3, dstq
    lea dstq, [dstq+strideq*8]
    jmp m(idct_8x8_internal).end

.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(m(idct_8x16_internal).end1)]
    mov dstq, r3
    jmp m(idct_8x8_internal).end

; .end1: clear all sixteen 16-byte coefficient rows and return.
.end1:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

; iadst_8x16_internal: pass 1 is the shared idct_8x16_internal.pass1 with the
; iadst 8x8 routine in r3. Pass 2 arranges the sixteen rows into m0-m7 plus
; stack slots 3-10 ([rsp+gprsize+32*5] is slot 10) before calling the
; iadst_16x8_internal core, then writes two halves via iadst_8x8_internal.end.
cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea r3, [o(m(iadst_8x8_internal).pass1)]
    jmp m(idct_8x16_internal).pass1

.pass2:
    lea tx2q, [o(m(iadst_8x16_internal).end)]

.pass2_pre:
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*5], m6
    mova [rsp+gprsize+16*6], m7
    mova m0, m2
    mova m1, m3
    mova m2, m4
    mova m3, m5

.pass2_main:
    mova m4, [coeffq+16*1 ]
    mova m5, [coeffq+16*3 ]
    mova m6, [coeffq+16*13]
    mova m7, [coeffq+16*15]
    mova [rsp+gprsize+16*3], m4
    mova [rsp+gprsize+16*4], m5
    mova [rsp+gprsize+16*9], m6
    mova [rsp+gprsize+32*5], m7
    mova m4, [coeffq+16*5 ]
    mova m5, [coeffq+16*7 ]
    mova m6, [coeffq+16*9 ]
    mova m7, [coeffq+16*11]

    call m(iadst_16x8_internal).main
    call m(iadst_16x8_internal).main_pass2_end

    mov r3, dstq
    lea dstq, [dstq+strideq*8]
    jmp m(iadst_8x8_internal).end

.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(m(idct_8x16_internal).end1)]
    mov dstq, r3
    jmp m(iadst_8x8_internal).end


INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

; iflipadst_8x16_internal: same register/stack arrangement as the adst
; kernel, but the first half is written at the original dstq and the second
; at dstq+stride*8 (precomputed into r3), through iflipadst_8x8_internal.end.
cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea r3, [o(m(iflipadst_8x8_internal).pass1)]
    jmp m(idct_8x16_internal).pass1

.pass2:
    lea tx2q, [o(m(iflipadst_8x16_internal).end)]
    lea r3, [dstq+strideq*8]

.pass2_pre:
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*5], m6
    mova [rsp+gprsize+16*6], m7
    mova m0, m2
    mova m1, m3
    mova m2, m4
    mova m3, m5

.pass2_main:
    mova m4, [coeffq+16*1 ]
    mova m5, [coeffq+16*3 ]
    mova m6, [coeffq+16*13]
    mova m7, [coeffq+16*15]
    mova [rsp+gprsize+16*3], m4
    mova [rsp+gprsize+16*4], m5
    mova [rsp+gprsize+16*9], m6
    mova [rsp+gprsize+32*5], m7
    mova m4, [coeffq+16*5 ]
    mova m5, [coeffq+16*7 ]
    mova m6, [coeffq+16*9 ]
    mova m7, [coeffq+16*11]

    call m(iadst_16x8_internal).main
    call m(iadst_16x8_internal).main_pass2_end
    jmp m(iflipadst_8x8_internal).end

.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(m(idct_8x16_internal).end1)]
    mov dstq, r3
    jmp m(iflipadst_8x8_internal).end


INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; iidentity_8x16_internal
; Pass 1 sends both row sets straight to idct_8x8_internal.pass1_end3,
; keeping the caller's tx2q in r3 meanwhile. Pass 2 (.end) applies IDTX16 to
; eight rows, scales by pw_2048 and writes them via idct_8x8_internal.end3;
; .end1 then reloads the rows at coeffq+16*1 and repeats .end, with
; idct_8x16_internal.end1 (the coefficient clear) as the final continuation.
cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq+16*1, 32, 1
    mov r3, tx2q
    lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)]
    mova [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal).pass1_end3

.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS coeffq+16*0, 32, 1
    mov tx2q, r3
    mova [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal).pass1_end3

.pass2:
    lea tx2q, [o(m(iidentity_8x16_internal).end1)]

.end:
    mova [rsp+gprsize+16*0], m7
    mova [rsp+gprsize+16*1], m6  ; m6 and m7 serve as IDTX16 scratch/coefficient
    mova m7, [o(pw_1697x16)]
    REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    mova m6, [rsp+gprsize+16*1]
    mova [rsp+gprsize+16*2], m5
    IDTX16 6, 5, 7
    mova m5, [rsp+gprsize+16*0]
    IDTX16 5, 7, 7
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [rsp+gprsize+16*2]
    mova [rsp+gprsize+16*0], m5
    mova [rsp+gprsize+16*1], m6
    mova [rsp+gprsize+16*2], m7
    jmp m(idct_8x8_internal).end3

.end1:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal).end1)]
    lea dstq, [dstq+strideq*2]
    jmp .end


; INV_TXFM_16X8_FN type1, type2
; Expands INV_TXFM_FN for the 16x8 size. For dct_dct it also emits a shortcut
; body (presumably the DC-only case - TODO confirm) that applies one extra
; pw_2896x8 scale and then reuses the 16x4 dct_dct .dconly code with r2d = 4.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_16384)]
    mov [coeffq], eobd
    pmulhrsw m0, m1
    mov r2d, 4
    lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
    jmp m(inv_txfm_add_dct_dct_16x4).dconly
.end:
    RET
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
INV_TXFM_16X8_FN dct, identity

; idct_16x8_internal
; Pass 1: 8-point stage on the rows at coeffq+16*0 (results parked in stack
; slots 3-9), local .main on the rows at coeffq+16*1, then two trips through
; idct_8x8_internal.pass1_end, with the caller's tx2q kept in r3.
; Pass 2: idct_8x8_internal.pass2_main twice, for dstq and for dstq+8.
cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq+16*0, 32, 1
    call m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+16*1, 32, 1
    call .main
    mov r3, tx2q
    lea tx2q, [o(m(idct_16x8_internal).pass1_end)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    jmp m(idct_8x8_internal).pass1_end

.pass2:
    lea tx2q, [o(m(idct_16x8_internal).end)]
    lea r3, [dstq+8]
    jmp m(idct_8x8_internal).pass2_main

.end:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal).end1)]
    mov dstq, r3
    jmp m(idct_8x8_internal).pass2_main


; .main: second half of the 16-point stage, reached with `call`; it also
; reads the value left in stack slot 0 at rsp+gprsize*2.
; NOTE(review): this routine continues past the end of the visible chunk.
ALIGN function_align
.main:
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+16*2], m6
    mova [rsp+gprsize*2+32*5], m5

    mova m6, [o(pd_2048)]
    ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076  ;t8a, t15a
    ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598  ;t9a, t14a
    psubsw m2, m0, m4  ;t9
    paddsw m0, m4  ;t8
    psubsw m4, m7, m3  ;t14
    paddsw m7, m3  ;t15
    ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784  ;t9a, t14a
    mova m3, [rsp+gprsize*2+16*1]
    mova m5, [rsp+gprsize*2+32*5]
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+32*5], m4
    mova m2, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*2], m7
    ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612  ;t10a, t13a
    ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189  ;t11a, t12a
    psubsw m4, m2, m3  ;t10
    paddsw m2, m3  ;t11
    psubsw m3, m1, m5  ;t13
    paddsw m1, m5  ;t12
    ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a
    mova m7, [rsp+gprsize*2+32*5]
    psubsw m6, m0, m2  ;t11a
    paddsw m0, m2  ;t8a
    paddsw m2, m7, m3  ;t9
    psubsw m7, m3  ;t10
    mova m5, [rsp+gprsize*2+16*0]
    psubsw m3, m5, m0  ;out8
; (continuation of idct_16x8_internal.main - the first half of the routine
; precedes this point)
; Final butterflies: each stage adds/subtracts a t-value to/from a row read
; from the stack (slots 0 and 3-9) and writes the sum back to a stack slot;
; the differences stay in registers. On return the code below leaves
; m0 = out8, m1 = out9, m2 = out10, m3 = out11, m4 = out12, m5 = out13,
; m6 = out14 and out15 in stack slot 0.
    paddsw        m0, m5  ;out7
    mova          [rsp+gprsize*2+32*5], m0
    mova          m5, [rsp+gprsize*2+16*9]
    psubsw        m0, m5, m2  ;out9
    paddsw        m2, m5  ;out6
    mova          [rsp+gprsize*2+16*0], m0
    mova          [rsp+gprsize*2+16*9], m2
    mova          m0, [rsp+gprsize*2+16*1]
    mova          m2, [rsp+gprsize*2+16*2]
    mova          [rsp+gprsize*2+16*1], m3
    psubsw        m5, m0, m4  ;t13
    paddsw        m0, m4  ;t14
    mova          m3, [o(pd_2048)]
    psubsw        m4, m2, m1  ;t12a
    paddsw        m1, m2  ;t15a
    mova          [rsp+gprsize*2+16*2], m1
    ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896  ;t10a, t13a
    ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896  ;t11, t12
    mova          m3, [rsp+gprsize*2+16*8]
    psubsw        m2, m3, m5  ;out10
    paddsw        m3, m5  ;out5
    mova          m5, [rsp+gprsize*2+16*7]
    mova          [rsp+gprsize*2+16*8], m3
    psubsw        m3, m5, m4  ;out11
    paddsw        m5, m4  ;out4
    mova          m4, [rsp+gprsize*2+16*6]
    mova          [rsp+gprsize*2+16*7], m5
    paddsw        m5, m4, m6  ;out3
    psubsw        m4, m6  ;out12
    mova          m6, [rsp+gprsize*2+16*5]
    mova          [rsp+gprsize*2+16*6], m5
    psubsw        m5, m6, m7  ;out13
    paddsw        m6, m7  ;out2
    mova          m7, [rsp+gprsize*2+16*4]
    mova          [rsp+gprsize*2+16*5], m6
    psubsw        m6, m7, m0  ;out14
    paddsw        m7, m0  ;out1
    mova          m1, [rsp+gprsize*2+16*2]
    mova          m0, [rsp+gprsize*2+16*3]
    mova          [rsp+gprsize*2+16*4], m7
    psubsw        m7, m0, m1  ;out15
    paddsw        m0, m1  ;out0
    mova          [rsp+gprsize*2+16*3], m0
    mova          m1, [rsp+gprsize*2+16*0]
    mova          m0, [rsp+gprsize*2+16*1]
    mova          [rsp+gprsize*2+16*0], m7
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; 16x8 inverse transform, ADST kernel.
; Entry: every one of the 16 coefficient rows is first multiplied by pw_2896x8
; (pmulhrsw). Rows 0, 1, 14, 15 land in stack slots 7, 8, 9, 10 (32*5 == 16*10),
; rows 8, 9, 6, 7 in slots 3, 4, 5, 6, and rows 2-5 / 10-13 stay in m0-m3 /
; m4-m7, which is the layout .main consumes.
cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova          m7, [o(pw_2896x8)]
    pmulhrsw      m0, m7, [coeffq+16*0 ]
    pmulhrsw      m1, m7, [coeffq+16*1 ]
    pmulhrsw      m2, m7, [coeffq+16*14]
    pmulhrsw      m3, m7, [coeffq+16*15]
    mova          [rsp+gprsize+16*7], m0
    mova          [rsp+gprsize+16*8], m1
    mova          [rsp+gprsize+16*9], m2
    mova          [rsp+gprsize+32*5], m3
    pmulhrsw      m0, m7, [coeffq+16*6 ]
    pmulhrsw      m1, m7, [coeffq+16*7 ]
    pmulhrsw      m2, m7, [coeffq+16*8 ]
    pmulhrsw      m3, m7, [coeffq+16*9 ]
    mova          [rsp+gprsize+16*3], m2
    mova          [rsp+gprsize+16*4], m3
    mova          [rsp+gprsize+16*5], m0
    mova          [rsp+gprsize+16*6], m1
    pmulhrsw      m0, m7, [coeffq+16*2 ]
    pmulhrsw      m1, m7, [coeffq+16*3 ]
    pmulhrsw      m2, m7, [coeffq+16*4 ]
    pmulhrsw      m3, m7, [coeffq+16*5 ]
    pmulhrsw      m4, m7, [coeffq+16*10]
    pmulhrsw      m5, m7, [coeffq+16*11]
    pmulhrsw      m6, m7, [coeffq+16*12]
    pmulhrsw      m7, [coeffq+16*13]            ; m7 (the multiplier) is consumed last

    call          .main
    call          .main_pass1_end
    mov           r3, tx2q                      ; keep the caller's continuation
    lea           tx2q, [o(m(iadst_16x8_internal).pass1_end)]
    jmp           m(iadst_8x8_internal).pass1_end

.pass1_end:
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    mov           tx2q, r3
    jmp           m(iadst_8x8_internal).pass1_end

.pass2:
    ; r3 = dst + 8 bytes; .end copies it into dstq for the second 8x8 half.
    lea           tx2q, [o(m(iadst_16x8_internal).end)]
    lea           r3, [dstq+8]
    jmp           m(iadst_8x8_internal).pass2_main

.end:
    LOAD_8ROWS    coeffq+16*1, 32
    lea           tx2q, [o(m(idct_8x16_internal).end1)]
    mov           dstq, r3
    jmp           m(iadst_8x8_internal).pass2_main

ALIGN function_align
.main:
    ; Shared ADST core (also called from the 8x16 and 16x16 routines in this
    ; file). Inputs: eight rows in m0-m7 plus eight rows in stack slots 3-10,
    ; as annotated by the ";inN" comments below. Offsets use gprsize*2 where
    ; the callers use gprsize. NOTE(review): presumably because of the return
    ; address pushed by `call` - confirm.
    ; Trailing comments name the value held after each line; on a store they
    ; may instead name the value that slot holds once the whole core and its
    ; .main_pass*_end epilogue have run.
    mova          [rsp+gprsize*2+16*0], m1
    mova          [rsp+gprsize*2+16*1], m2
    mova          [rsp+gprsize*2+16*2], m6

    mova          m6, [o(pd_2048)]
    ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973  ;t3, t2
    ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106  ;t11, t10
    psubsw        m1, m0, m4  ;t10a
    paddsw        m0, m4  ;t2a
    psubsw        m4, m7, m3  ;t11a
    paddsw        m3, m7  ;t3a
    ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276  ;t11, t10
    mova          m2, [rsp+gprsize*2+16*0]  ;in3
    mova          m7, [rsp+gprsize*2+16*1]  ;in4
    mova          [rsp+gprsize*2+16*0], m1  ;t11
    mova          [rsp+gprsize*2+16*1], m4  ;t10
    mova          m1, [rsp+gprsize*2+16*2]  ;in12
    mova          [rsp+gprsize*2+16*2], m0  ;t2a
    ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703  ;t5, t4
    ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380  ;t13, t12
    psubsw        m0, m7, m1  ;t12a
    paddsw        m1, m7  ;t4a
    psubsw        m4, m5, m2  ;t13a
    paddsw        m5, m2  ;t5a
    ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799  ;t12, t13
    mova          m2, [rsp+gprsize*2+16*8]  ;in1
    mova          m7, [rsp+gprsize*2+16*9]  ;in14
    mova          [rsp+gprsize*2+16*8], m4  ;t12
    mova          [rsp+gprsize*2+16*9], m0  ;t13
    mova          m4, [rsp+gprsize*2+16*4]  ;in9
    mova          m0, [rsp+gprsize*2+16*5]  ;in6
    mova          [rsp+gprsize*2+16*4], m1  ;t4a
    mova          [rsp+gprsize*2+16*5], m5  ;t5a
    ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601  ;t15, t14
    ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290  ;t7, t6
    psubsw        m1, m0, m7  ;t14a
    paddsw        m0, m7  ;t6a
    psubsw        m5, m4, m2  ;t15a
    paddsw        m4, m2  ;t7a
    ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406  ;t14, t15
    mova          m2, [rsp+gprsize*2+16*2]  ;t2a
    mova          [rsp+gprsize*2+16*2], m5  ;t14
    psubsw        m7, m2, m0  ;t6
    paddsw        m2, m0  ;t2
    psubsw        m0, m3, m4  ;t7
    paddsw        m3, m4  ;t3
    ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567  ;t6a, t7a
    mova          m4, [rsp+gprsize*2+16*7]  ;in0
    mova          m5, [rsp+gprsize*2+32*5]  ;in15
    mova          [rsp+gprsize*2+16*7], m3  ;t3
    mova          [rsp+gprsize*2+32*5], m1  ;t15
    mova          m1, [rsp+gprsize*2+16*6]  ;in7
    mova          m3, [rsp+gprsize*2+16*3]  ;in8
    mova          [rsp+gprsize*2+16*6], m7  ;t7a
    mova          [rsp+gprsize*2+16*3], m0  ;t6a
    ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091  ;t1, t0
    ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751  ;t9, t8
    psubsw        m0, m4, m3  ;t8a
    paddsw        m4, m3  ;t0a
    psubsw        m3, m5, m1  ;t9a
    paddsw        m5, m1  ;t1a
    ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017  ;t9, t8
    mova          m1, [rsp+gprsize*2+16*4]  ;t4a
    mova          m7, [rsp+gprsize*2+16*5]  ;t5a
    mova          [rsp+gprsize*2+16*4], m3  ;t8
    mova          [rsp+gprsize*2+16*5], m0  ;t9
    psubsw        m0, m4, m1  ;t4
    paddsw        m4, m1  ;t0
    psubsw        m3, m5, m7  ;t5
    paddsw        m5, m7  ;t1
    ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784  ;t5a, t4a
    mova          m7, [rsp+gprsize*2+16*3]  ;t6a
    psubsw        m1, m4, m2  ;t2a
    paddsw        m4, m2  ;out0
    mova          [rsp+gprsize*2+16*3], m4  ;out0
    mova          m4, [rsp+gprsize*2+16*6]  ;t7a
    psubsw        m2, m3, m7  ;t6
    paddsw        m3, m7  ;-out3
    mova          [rsp+gprsize*2+16*6], m3  ;-out3
    psubsw        m3, m0, m4  ;t7
    paddsw        m0, m4  ;out12
    mova          [rsp+gprsize*2+16*12], m3
    mova          m3, [rsp+gprsize*2+16*7]  ;t3
    mova          [rsp+gprsize*2+16* 7], m2  ;out4
    psubsw        m2, m5, m3  ;t3a
    paddsw        m5, m3  ;-out15
    mova          [rsp+gprsize*2+16*11], m2
    mova          m2, [rsp+gprsize*2+32*5]  ;t15
    mova          [rsp+gprsize*2+16*10], m1  ;-out7
    mova          m1, [rsp+gprsize*2+16*0]  ;t11
    mova          [rsp+gprsize*2+16*0 ], m5  ;-out15
    mova          m3, [rsp+gprsize*2+16*1]  ;t10
    mova          [rsp+gprsize*2+16*1 ], m4  ;-out11
    mova          m4, [rsp+gprsize*2+16*2]  ;t14
    mova          [rsp+gprsize*2+16*2 ], m0  ;out12
    psubsw        m0, m3, m4  ;t14a
    paddsw        m3, m4  ;t10a
    psubsw        m5, m1, m2  ;t15a
    paddsw        m1, m2  ;t11a
    ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567  ;t14, t15
    mova          m2, [rsp+gprsize*2+16*4]  ;t8
    mova          m4, [rsp+gprsize*2+16*5]  ;t9
    mova          [rsp+gprsize*2+16*4], m3  ;t10a
    mova          [rsp+gprsize*2+16*5], m1  ;t11a
    mova          m3, [rsp+gprsize*2+16*8]  ;t12
    mova          m1, [rsp+gprsize*2+16*9]  ;t13
    mova          [rsp+gprsize*2+16*8], m5  ;t14
    mova          [rsp+gprsize*2+16*9], m0  ;t15
    psubsw        m5, m2, m3  ;t12a
    paddsw        m2, m3  ;t8a
    psubsw        m0, m4, m1  ;t13a
    paddsw        m4, m1  ;t9a
    ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784  ;t13, t12
    mova          m6, [rsp+gprsize*2+16*4]  ;t10a
    mova          m1, [rsp+gprsize*2+16*5]  ;t11a
    psubsw        m3, m2, m6  ;t10
    paddsw        m2, m6  ;-out1
    paddsw        m6, m4, m1  ;out14
    psubsw        m4, m1  ;t11
    mova          [rsp+gprsize*2+16*14], m4
    mova          [rsp+gprsize*2+16* 4], m2  ;-out1
    mova          m4, [rsp+gprsize*2+16*8]  ;t14
    mova          m2, [rsp+gprsize*2+16*9]  ;t15
    mova          [rsp+gprsize*2+16* 9], m3  ;out6
    psubsw        m3, m0, m4  ;t14a
    paddsw        m0, m4  ;out2
    psubsw        m4, m5, m2  ;t15a
    paddsw        m5, m2  ;-out13
    mova          [rsp+gprsize*2+16* 5], m0  ;out2
    ret
ALIGN function_align
; Epilogue of the ADST core used by pass 1. Four pairs left by .main
; (m3/m4, stack slots 9/14, 7/12 and 10/11) are interleaved and multiplied
; with pw_2896_2896 and pw_2896_m2896 (pmaddwd), rounded with pd_2048,
; shifted right by 12 and packed back to words.
.main_pass1_end:
    mova          m0, [rsp+gprsize*2+16*14]
    mova          [rsp+gprsize*2+16*14], m5
    mova          [rsp+gprsize*2+16*15], m6
    mova          m5, [o(pw_2896_2896)]
    mova          m6, [o(pw_2896_m2896)]
    mova          m7, [o(pd_2048)]
    punpcklwd     m2, m3, m4
    punpckhwd     m3, m4
    pmaddwd       m4, m5, m2
    pmaddwd       m2, m6
    pmaddwd       m1, m5, m3
    pmaddwd       m3, m6
    REPX          {paddd x, m7}, m4, m2, m1, m3
    REPX          {psrad x, 12}, m4, m1, m2, m3
    packssdw      m4, m1  ;-out5
    packssdw      m2, m3  ;out10
    mova          [rsp+gprsize*2+16* 8], m4
    mova          m3, [rsp+gprsize*2+16* 9]
    punpcklwd     m1, m3, m0
    punpckhwd     m3, m0
    pmaddwd       m0, m5, m1
    pmaddwd       m1, m6
    pmaddwd       m4, m5, m3
    pmaddwd       m3, m6
    REPX          {paddd x, m7}, m0, m1, m4, m3
    REPX          {psrad x, 12}, m0, m4, m1, m3
    packssdw      m0, m4  ;out6
    packssdw      m1, m3  ;-out9
    mova          [rsp+gprsize*2+16* 9], m0
    mova          m0, [rsp+gprsize*2+16* 7]
    mova          m4, [rsp+gprsize*2+16*12]
    punpcklwd     m3, m0, m4
    punpckhwd     m0, m4
    pmaddwd       m4, m5, m3
    pmaddwd       m3, m6
    pmaddwd       m5, m0                        ; m5 (pw_2896_2896) is overwritten here
    pmaddwd       m0, m6
    REPX          {paddd x, m7}, m4, m3, m5, m0
    REPX          {psrad x, 12}, m4, m5, m3, m0
    packssdw      m4, m5  ;out4
    packssdw      m3, m0  ;-out11
    mova          [rsp+gprsize*2+16* 7], m4
    mova          m4, [rsp+gprsize*2+16*10]
    mova          m5, [rsp+gprsize*2+16*11]
    punpcklwd     m0, m4, m5
    punpckhwd     m4, m5
    pmaddwd       m5, m0, [o(pw_2896_2896)]     ; constant re-read from memory
    pmaddwd       m0, m6
    pmaddwd       m6, m4
    pmaddwd       m4, [o(pw_2896_2896)]
    REPX          {paddd x, m7}, m5, m0, m6, m4
    REPX          {psrad x, 12}, m0, m6, m5, m4
    packssdw      m0, m6  ;out8
    packssdw      m5, m4  ;-out7
    mova          [rsp+gprsize*2+16*10], m5
    mova          m4, [rsp+gprsize*2+16* 2]  ;out12
    mova          m5, [rsp+gprsize*2+16*14]  ;-out13
    mova          m6, [rsp+gprsize*2+16*15]  ;out14
    ret
ALIGN function_align
; Epilogue of the ADST core used by pass 2. The same four pairs are combined
; with saturating add/sub and then multiplied by pw_2896x8 (pmulhrsw).
.main_pass2_end:
    mova          m7, [o(pw_2896x8)]
    mova          m1, [rsp+gprsize*2+16* 9]
    mova          m2, [rsp+gprsize*2+16*14]
    paddsw        m0, m1, m2
    psubsw        m1, m2
    pmulhrsw      m0, m7  ;out6
    pmulhrsw      m1, m7  ;-out9
    mova          [rsp+gprsize*2+16* 9], m0
    psubsw        m2, m3, m4
    paddsw        m3, m4
    pmulhrsw      m2, m7  ;out10
    pmulhrsw      m3, m7  ;-out5
    mova          [rsp+gprsize*2+16* 8], m3
    mova          m3, [rsp+gprsize*2+16* 7]
    mova          m4, [rsp+gprsize*2+16*12]
    paddsw        m0, m3, m4
    psubsw        m3, m4
    pmulhrsw      m0, m7  ;out4
    pmulhrsw      m3, m7  ;-out11
    mova          [rsp+gprsize*2+16* 7], m0
    mova          m0, [rsp+gprsize*2+16*10]
    paddsw        m4, m0, [rsp+gprsize*2+16*11]
    psubsw        m0, [rsp+gprsize*2+16*11]
    pmulhrsw      m4, m7  ;-out7
    pmulhrsw      m0, m7  ;out8
    mova          [rsp+gprsize*2+16*10], m4
    mova          m4, [rsp+gprsize*2+16*2 ]  ;out12
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; 16x8 inverse transform, flipped-ADST kernel.
; The entry performs the same pw_2896x8 pre-scaling and register/stack layout
; as iadst_16x8_internal and reuses its .main / .main_pass1_end; only the
; hand-off differs (iflipadst_8x8_internal continuations, and the first group
; of rows is parked at coeffq+16*0 instead of on the stack).
cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova          m7, [o(pw_2896x8)]
    pmulhrsw      m0, m7, [coeffq+16*0 ]
    pmulhrsw      m1, m7, [coeffq+16*1 ]
    pmulhrsw      m2, m7, [coeffq+16*14]
    pmulhrsw      m3, m7, [coeffq+16*15]
    mova          [rsp+gprsize+16*7], m0
    mova          [rsp+gprsize+16*8], m1
    mova          [rsp+gprsize+16*9], m2
    mova          [rsp+gprsize+32*5], m3
    pmulhrsw      m0, m7, [coeffq+16*6 ]
    pmulhrsw      m1, m7, [coeffq+16*7 ]
    pmulhrsw      m2, m7, [coeffq+16*8 ]
    pmulhrsw      m3, m7, [coeffq+16*9 ]
    mova          [rsp+gprsize+16*3], m2
    mova          [rsp+gprsize+16*4], m3
    mova          [rsp+gprsize+16*5], m0
    mova          [rsp+gprsize+16*6], m1
    pmulhrsw      m0, m7, [coeffq+16*2 ]
    pmulhrsw      m1, m7, [coeffq+16*3 ]
    pmulhrsw      m2, m7, [coeffq+16*4 ]
    pmulhrsw      m3, m7, [coeffq+16*5 ]
    pmulhrsw      m4, m7, [coeffq+16*10]
    pmulhrsw      m5, m7, [coeffq+16*11]
    pmulhrsw      m6, m7, [coeffq+16*12]
    pmulhrsw      m7, [coeffq+16*13]

    call          m(iadst_16x8_internal).main
    call          m(iadst_16x8_internal).main_pass1_end

    ; Park the register half (m7 taken from stack slot 0) at coeffq+16*0 and
    ; process the stack half first.
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    coeffq+16*0, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    mov           r3, tx2q                      ; keep the caller's continuation
    lea           tx2q, [o(m(iflipadst_16x8_internal).pass1_end)]
    jmp           m(iflipadst_8x8_internal).pass1_end

.pass1_end:
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS    coeffq+16*0, 32
    mova          [rsp+gprsize+16*0], m7
    mov           tx2q, r3
    jmp           m(iflipadst_8x8_internal).pass1_end

.pass2:
    ; r3 = dst + 8 bytes; .end copies it into dstq for the second 8x8 half.
    lea           tx2q, [o(m(iflipadst_16x8_internal).end)]
    lea           r3, [dstq+8]
    jmp           m(iflipadst_8x8_internal).pass2_main

.end:
    LOAD_8ROWS    coeffq+16*1, 32
    lea           tx2q, [o(m(idct_8x16_internal).end1)]
    mov           dstq, r3
    jmp           m(iflipadst_8x8_internal).pass2_main


INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; 16x8 inverse transform, identity kernel.
; .pass1 runs twice. Each round takes four rows already in m4-m7 plus the four
; rows at coeffq+16*{0,2,4,6} (after the `sub`), multiplies each by pw_2896x8
; and then applies  x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384).
cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    add           coeffq, 16*16
    mova          m4, [coeffq-16*7]
    mova          m5, [coeffq-16*5]
    mova          m6, [coeffq-16*3]
    mova          m7, [coeffq-16*1]
    mov           r3, tx2q                      ; keep the caller's continuation
    lea           tx2q, [o(m(iidentity_16x8_internal).pass1_end)]

.pass1:
    mova          m0, [o(pw_2896x8)]
    mova          m2, [o(pw_1697x16)]
    mova          m3, [o(pw_16384)]
    sub           coeffq, 8*16
    REPX          {pmulhrsw x, m0}, m4, m5, m6, m7
    pmulhrsw      m1, m2, m4
    pmulhrsw      m1, m3
    paddsw        m1, m4  ; 1
    pmulhrsw      m4, m2, m5
    pmulhrsw      m4, m3
    paddsw        m4, m5  ; 3
    pmulhrsw      m5, m2, m6
    pmulhrsw      m5, m3
    paddsw        m5, m6  ; 5
    pmulhrsw      m6, m2, m7
    pmulhrsw      m6, m3
    paddsw        m7, m6  ; 7
    pmulhrsw      m6, m0, [coeffq+16*6]
    mova          [rsp+gprsize+16*0], m4        ; row 3 waits in stack slot 0
    pmulhrsw      m4, m2, m6
    pmulhrsw      m4, m3
    paddsw        m6, m4  ; 6
    pmulhrsw      m4, m0, [coeffq+16*4]
    mova          [rsp+gprsize+16*1], m6        ; row 6 goes to stack slot 1
    pmulhrsw      m6, m2, m4
    pmulhrsw      m6, m3
    paddsw        m4, m6  ; 4
    pmulhrsw      m6, m0, [coeffq+16*2]
    pmulhrsw      m0, [coeffq+16*0]
    pmulhrsw      m2, m6
    pmulhrsw      m2, m3
    paddsw        m2, m6  ; 2
    pmulhrsw      m6, m0, [o(pw_1697x16)]       ; m2 no longer holds the constant
    pmulhrsw      m6, m3
    mova          m3, [rsp+gprsize+16*0]
    paddsw        m0, m6
    jmp           m(idct_8x8_internal).pass1_end3

.pass1_end:
    ; Store m4-m7 at coeffq+16*{1,3,5,7}, fetch the next four input rows from
    ; coeffq-16*{7,5,3,1}, then store m0-m3 into the locations just vacated.
    mova          [coeffq+16*1], m4
    mova          [coeffq+16*3], m5
    mova          [coeffq+16*5], m6
    mova          [coeffq+16*7], m7
    mova          m4, [coeffq-16*7]
    mova          m5, [coeffq-16*5]
    mova          m6, [coeffq-16*3]
    mova          m7, [coeffq-16*1]
    mova          [coeffq-16*7], m0
    mova          [coeffq-16*5], m1
    mova          [coeffq-16*3], m2
    mova          [coeffq-16*1], m3
    mov           tx2q, r3
    jmp           .pass1

.pass2:
    ; r3 = dst + 8 bytes; .end copies it into dstq for the second 8x8 half.
    lea           tx2q, [o(m(iidentity_16x8_internal).end)]
    lea           r3, [dstq+8]
    jmp           m(iidentity_8x8_internal).end

.end:
    LOAD_8ROWS    coeffq+16*1, 32
    lea           tx2q, [o(m(idct_8x16_internal).end1)]
    mov           dstq, r3
    jmp           m(iidentity_8x8_internal).end


; Declares the 16x16 entry point for a (type1, type2) pair through INV_TXFM_FN.
; For dct_dct it additionally emits the DC-only code: the first coefficient is
; multiplied by pw_2896x8 once, m2 is loaded from pw_8192, r2d is set to 8 and
; control moves to inv_txfm_add_dct_dct_16x4.dconly with tx2q pointing at the
; local .end (RET).
; NOTE(review): `mov [coeffq], eobd` presumably clears the coefficient because
; eob is zero on this path - INV_TXFM_FN is not visible here, confirm.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
    INV_TXFM_FN   %1, %2, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
    movd          m1, [o(pw_2896x8)]
    pmulhrsw      m0, m1, [coeffq]
    movd          m2, [o(pw_8192)]
    mov           [coeffq], eobd
    mov           r2d, 8
    lea           tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)]
    jmp           m(inv_txfm_add_dct_dct_16x4).dconly
.end:
    RET
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
INV_TXFM_16X16_FN dct, identity

; 16x16 inverse transform, DCT kernel.
; Pass 1 runs the 16-point DCT (idct_8x8_internal.main on one set of rows,
; idct_16x8_internal.main on the other) twice: first on the rows at
; coeffq+16*1 / coeffq+16*3 (stride 64), then on coeffq+16*0 / coeffq+16*2.
; Each run yields two 8-row groups that go through
; idct_8x8_internal.pass1_end1 with m7 = pw_8192 and are saved at
; coeffq+16*17, +16*1, +16*16 (stride 32); the last group continues at the
; caller's tx2q, which is kept in r3 meanwhile.
cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS    coeffq+16*1, 64
    call          m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+16*3, 64
    call          m(idct_16x8_internal).main
    mov           r3, tx2q
    lea           tx2q, [o(m(idct_16x16_internal).pass1_end)]
    mova          m7, [o(pw_8192)]
    jmp           m(idct_8x8_internal).pass1_end1

.pass1_end:
    SAVE_8ROWS    coeffq+16*17, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_16x16_internal).pass1_end1)]
    mova          m7, [o(pw_8192)]
    jmp           m(idct_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS    coeffq+16*0, 64
    call          m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+16*2, 64
    call          m(idct_16x8_internal).main
    lea           tx2q, [o(m(idct_16x16_internal).pass1_end2)]
    mova          m7, [o(pw_8192)]
    jmp           m(idct_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS    coeffq+16*16, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    mov           tx2q, r3
    mova          m7, [o(pw_8192)]
    jmp           m(idct_8x8_internal).pass1_end1

.pass2:
    lea           tx2q, [o(m(idct_16x16_internal).end)]
    jmp           m(idct_8x16_internal).pass2_pre

.end:
    ; NOTE(review): r3 is presumably set by idct_8x16_internal.pass2_pre
    ; (outside this excerpt) - confirm. It is copied into dstq here and r3 is
    ; re-pointed 8 bytes further right for .end1.
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_16x16_internal).end1)]
    mov           dstq, r3
    lea           r3, [dstq+8]
    jmp           m(idct_8x8_internal).end

.end1:
    ; Zero the first 16 coefficient rows, advance coeffq by 32*8 bytes and run
    ; the 8x16 pass-2 code on the next group with dstq = r3.
    pxor          m7, m7
    REPX          {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add           coeffq, 32*8
    mov           dstq, r3

    mova          m0, [coeffq+16*0 ]
    mova          m1, [coeffq+16*4 ]
    mova          m2, [coeffq+16*8 ]
    mova          m3, [coeffq+16*12]
    mova          m4, [coeffq+16*1 ]
    mova          m5, [coeffq+16*5 ]
    mova          m6, [coeffq+16*9 ]
    mova          m7, [coeffq+16*13]
    lea           tx2q, [o(m(idct_8x16_internal).end)]
    jmp           m(idct_8x16_internal).pass2_main


; Loads the 16 rows at odd 16-byte offsets of coeffq into the layout read by
; iadst_16x8_internal.main: rows 1, 3, 29, 31 -> stack slots 7, 8, 9, 10
; (32*5 == 16*10); rows 17, 19, 13, 15 -> slots 3, 4, 5, 6;
; rows 5, 7, 9, 11, 21, 23, 25, 27 -> m0-m7.
%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
    mova          m0, [coeffq+16*1 ]
    mova          m1, [coeffq+16*3 ]
    mova          m2, [coeffq+16*29]
    mova          m3, [coeffq+16*31]
    mova          [rsp+gprsize+16*7], m0
    mova          [rsp+gprsize+16*8], m1
    mova          [rsp+gprsize+16*9], m2
    mova          [rsp+gprsize+32*5], m3
    mova          m0, [coeffq+16*13]
    mova          m1, [coeffq+16*15]
    mova          m2, [coeffq+16*17]
    mova          m3, [coeffq+16*19]
    mova          [rsp+gprsize+16*3], m2
    mova          [rsp+gprsize+16*4], m3
    mova          [rsp+gprsize+16*5], m0
    mova          [rsp+gprsize+16*6], m1
    mova          m0, [coeffq+16*5 ]
    mova          m1, [coeffq+16*7 ]
    mova          m2, [coeffq+16*9 ]
    mova          m3, [coeffq+16*11]
    mova          m4, [coeffq+16*21]
    mova          m5, [coeffq+16*23]
    mova          m6, [coeffq+16*25]
    mova          m7, [coeffq+16*27]
%endmacro

; Same layout as ITX_16X16_ADST_LOAD_ODD_COEFS, for the rows at even 16-byte
; offsets: rows 0, 2, 28, 30 -> stack slots 7, 8, 9, 10; rows 16, 18, 12, 14
; -> slots 3, 4, 5, 6; rows 4, 6, 8, 10, 20, 22, 24, 26 -> m0-m7.
%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
    mova          m0, [coeffq+16*0 ]
    mova          m1, [coeffq+16*2 ]
    mova          m2, [coeffq+16*28]
    mova          m3, [coeffq+16*30]
    mova          [rsp+gprsize+16*7], m0
    mova          [rsp+gprsize+16*8], m1
    mova          [rsp+gprsize+16*9], m2
    mova          [rsp+gprsize+32*5], m3
    mova          m0, [coeffq+16*12]
    mova          m1, [coeffq+16*14]
    mova          m2, [coeffq+16*16]
    mova          m3, [coeffq+16*18]
    mova          [rsp+gprsize+16*3], m2
    mova          [rsp+gprsize+16*4], m3
    mova          [rsp+gprsize+16*5], m0
    mova          [rsp+gprsize+16*6], m1
    mova          m0, [coeffq+16*4 ]
    mova          m1, [coeffq+16*6 ]
    mova          m2, [coeffq+16*8 ]
    mova          m3, [coeffq+16*10]
    mova          m4, [coeffq+16*20]
    mova          m5, [coeffq+16*22]
    mova          m6, [coeffq+16*24]
    mova          m7, [coeffq+16*26]
%endmacro

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

; 16x16 inverse transform, ADST kernel.
; Pass 1 runs iadst_16x8_internal.main + .main_pass1_end on the odd-offset
; rows and then on the even-offset rows. Each run yields two 8-row groups that
; go through iadst_8x8_internal.pass1_end1 with m7 = pw_8192 and are saved at
; coeffq+16*17, +16*1, +16*16 (stride 32); the last group continues at the
; caller's tx2q, which is kept in r3 meanwhile.
cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ITX_16X16_ADST_LOAD_ODD_COEFS
    call          m(iadst_16x8_internal).main
    call          m(iadst_16x8_internal).main_pass1_end

    mov           r3, tx2q
    lea           tx2q, [o(m(iadst_16x16_internal).pass1_end)]
    mova          m7, [o(pw_8192)]
    jmp           m(iadst_8x8_internal).pass1_end1

.pass1_end:
    SAVE_8ROWS    coeffq+16*17, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(iadst_16x16_internal).pass1_end1)]
    mova          m7, [o(pw_8192)]
    jmp           m(iadst_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS    coeffq+16*1, 32
    ITX_16X16_ADST_LOAD_EVEN_COEFS
    call          m(iadst_16x8_internal).main
    call          m(iadst_16x8_internal).main_pass1_end

    lea           tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
    mova          m7, [o(pw_8192)]
    jmp           m(iadst_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS    coeffq+16*16, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    mov           tx2q, r3
    mova          m7, [o(pw_8192)]
    jmp           m(iadst_8x8_internal).pass1_end1

.pass2:
    lea           tx2q, [o(m(iadst_16x16_internal).end)]
    jmp           m(iadst_8x16_internal).pass2_pre

.end:
    ; dstq <- r3, then r3 is re-pointed 8 bytes further right for .end1.
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(iadst_16x16_internal).end1)]
    mov           dstq, r3
    lea           r3, [dstq+8]
    jmp           m(iadst_8x8_internal).end

.end1:
    ; Zero the first 16 coefficient rows, advance coeffq by 32*8 bytes and set
    ; up the next group in the layout iadst_8x16_internal.pass2_main expects
    ; (rows 0, 2, 12, 14 -> stack slots 7, 8, 5, 6; rows 4, 6, 8, 10 -> m0-m3).
    pxor          m7, m7
    REPX          {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add           coeffq, 32*8
    mov           dstq, r3

    mova          m4, [coeffq+16*0 ]
    mova          m5, [coeffq+16*2 ]
    mova          m0, [coeffq+16*4 ]
    mova          m1, [coeffq+16*6 ]
    mova          m2, [coeffq+16*8 ]
    mova          m3, [coeffq+16*10]
    mova          m6, [coeffq+16*12]
    mova          m7, [coeffq+16*14]
    mova          [rsp+gprsize+16*7], m4
    mova          [rsp+gprsize+16*8], m5
    mova          [rsp+gprsize+16*5], m6
    mova          [rsp+gprsize+16*6], m7
    lea           tx2q, [o(m(iadst_8x16_internal).end)]
    jmp           m(iadst_8x16_internal).pass2_main


INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; 16x16 inverse transform, flipped-ADST kernel.
; Pass 1 reuses the ADST core like iadst_16x16_internal but hands the groups
; to iflipadst_8x8_internal.pass1_end1 with m7 = pw_m8192 and saves them at
; coeffq+16*1, +16*17 and +16*16 (stride 32). In the second run the register
; half is parked at coeffq+16*0 while the stack half is processed first.
cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ITX_16X16_ADST_LOAD_ODD_COEFS
    call          m(iadst_16x8_internal).main
    call          m(iadst_16x8_internal).main_pass1_end

    mov           r3, tx2q                      ; keep the caller's continuation
    lea           tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
    mova          m7, [o(pw_m8192)]
    jmp           m(iflipadst_8x8_internal).pass1_end1

.pass1_end:
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)]
    mova          m7, [o(pw_m8192)]
    jmp           m(iflipadst_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS    coeffq+16*17, 32
    ITX_16X16_ADST_LOAD_EVEN_COEFS
    call          m(iadst_16x8_internal).main
    call          m(iadst_16x8_internal).main_pass1_end

    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    coeffq+16*0, 32
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)]
    mova          m7, [o(pw_m8192)]
    jmp           m(iflipadst_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS    coeffq+16*16, 32
    LOAD_8ROWS    coeffq+16* 0, 32
    mova          [rsp+gprsize+16*0], m7
    mov           tx2q, r3
    mova          m7, [o(pw_m8192)]
    jmp           m(iflipadst_8x8_internal).pass1_end1

.pass2:
    ; r3 = dst + 8 bytes, copied into dstq by .end1 for the right-hand half.
    lea           tx2q, [o(m(iflipadst_16x16_internal).end)]
    lea           r3, [dstq+8]
    jmp           m(iflipadst_8x16_internal).pass2_pre

.end:
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(iflipadst_16x16_internal).end1)]
    lea           dstq, [dstq+strideq*2]
    jmp           m(iflipadst_8x8_internal).end

.end1:
    ; Zero the first 16 coefficient rows, advance coeffq by 32*8 bytes and set
    ; up the next group (rows 0, 2, 12, 14 -> stack slots 7, 8, 5, 6;
    ; rows 4, 6, 8, 10 -> m0-m3) for iflipadst_8x16_internal.pass2_main.
    pxor          m7, m7
    REPX          {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add           coeffq, 32*8

    mova          m4, [coeffq+16*0 ]
    mova          m5, [coeffq+16*2 ]
    mova          m0, [coeffq+16*4 ]
    mova          m1, [coeffq+16*6 ]
    mova          m2, [coeffq+16*8 ]
    mova          m3, [coeffq+16*10]
    mova          m6, [coeffq+16*12]
    mova          m7, [coeffq+16*14]
    mova          [rsp+gprsize+16*7], m4
    mova          [rsp+gprsize+16*8], m5
    mova          [rsp+gprsize+16*5], m6
    mova          [rsp+gprsize+16*6], m7

    lea           tx2q, [o(m(iflipadst_16x16_internal).end2)]
    mov           dstq, r3
    jmp           m(iflipadst_8x16_internal).pass2_main

.end2:
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_8x16_internal).end1)]
    lea           dstq, [dstq+strideq*2]
    jmp           m(iflipadst_8x8_internal).end


; IDTX16B src/dst, tmp, coef:
;   tmp     = pmulhrsw(coef, src/dst)
;   tmp     = tmp >> 1               (arithmetic shift)
;   src/dst = pavgw(src/dst, tmp)
; The third argument is a register number; the callers in this file pass the
; register holding pw_1697x16.
%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
    pmulhrsw      m%2, m%3, m%1
    psraw         m%2, 1
    pavgw         m%1, m%2
%endmacro

INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity

; 16x16 inverse transform, identity kernel. The entry point moves coeffq
; 16*17 bytes forward and keeps the caller's tx2q in r3 (the routine
; continues past this point).
cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    add           coeffq, 16*17
    mov           r3, tx2q
; (continuation of the iidentity_16x16_internal entry sequence)
    lea           tx2q, [o(m(iidentity_16x16_internal).pass1_end)]

.pass1:
    ; One round: eight rows at coeffq+32*0..7 are run through IDTX16B with
    ; m6 = pw_1697x16. The row loaded into m7 is stored to stack slot 1 so
    ; that m7 can serve as scratch and then receive the last row; that last
    ; IDTX16B call uses m6 as both scratch and coefficient.
    ; .pass1 is entered four times in total; the .pass1_end* labels below
    ; store the previous round and step coeffq between rounds.
    mova          m6, [o(pw_1697x16)]
    mova          m7, [coeffq+32*6]
    mova          m0, [coeffq+32*0]
    mova          m1, [coeffq+32*1]
    mova          m2, [coeffq+32*2]
    mova          m3, [coeffq+32*3]
    mova          m4, [coeffq+32*4]
    REPX          {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
    mova          m5, [coeffq+32*5]
    mova          [rsp+gprsize+16*1], m7
    IDTX16B       5, 7, 6
    mova          m7, [coeffq+32*7]
    IDTX16B       7, 6, 6
    jmp           m(idct_8x8_internal).pass1_end3

.pass1_end:
    SAVE_8ROWS    coeffq, 32
    sub           coeffq, 16
    lea           tx2q, [o(m(iidentity_16x16_internal).pass1_end1)]
    jmp           .pass1

.pass1_end1:
    SAVE_8ROWS    coeffq, 32
    sub           coeffq, 15*16
    lea           tx2q, [o(m(iidentity_16x16_internal).pass1_end2)]
    jmp           .pass1

.pass1_end2:
    SAVE_8ROWS    coeffq, 32
    sub           coeffq, 16
    mov           tx2q, r3                      ; last round returns to the caller's continuation
    jmp           .pass1

.pass2:
    ; r3 = dst + 8 bytes, copied into dstq by .end2 for the right-hand half.
    lea           r3, [dstq+8]
    lea           tx2q, [o(m(iidentity_16x16_internal).end1)]

.end:
    ; Apply IDTX16 (macro defined outside this excerpt) with m7 = pw_1697x16
    ; and then multiply by pw_2048 (m4). The rows arriving in m7 and m4 wait
    ; in stack slots 0 and 1 while their registers hold constants/scratch.
    ; On exit m0-m3 hold rows 0-3, m4 the row that arrived in m4, and stack
    ; slots 0, 1, 2 the rows that arrived in m7, m6, m5.
    mova          [rsp+gprsize+16*0], m7
    mova          [rsp+gprsize+16*1], m4
    mova          m7, [o(pw_1697x16)]
    REPX          {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
    mova          m4, [o(pw_2048)]
    pmulhrsw      m5, m4
    pmulhrsw      m6, m4
    mova          [rsp+gprsize+16*2], m5
    mova          m5, [rsp+gprsize+16*1]
    mova          [rsp+gprsize+16*1], m6
    IDTX16        5, 6, 7
    mova          m6, [rsp+gprsize+16*0]
    IDTX16        6, 7, 7
    REPX          {pmulhrsw x, m4}, m0, m1, m2, m3, m6
    pmulhrsw      m4, m5
    mova          [rsp+gprsize+16*0], m6
    jmp           m(idct_8x8_internal).end3

.end1:
    LOAD_8ROWS    coeffq+16*1, 32
    lea           tx2q, [o(m(iidentity_16x16_internal).end2)]
    lea           dstq, [dstq+strideq*2]
    jmp           .end

.end2:
    ; Zero the first 16 coefficient rows, advance coeffq by 32*8 bytes and
    ; continue with dstq = r3.
    pxor          m7, m7
    REPX          {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add           coeffq, 32*8
    LOAD_8ROWS    coeffq, 32
    lea           tx2q, [o(m(iidentity_16x16_internal).end3)]
    mov           dstq, r3
    jmp           .end

.end3:
    LOAD_8ROWS    coeffq+16*1, 32
    lea           tx2q, [o(m(idct_8x16_internal).end1)]
    lea           dstq, [dstq+strideq*2]
    jmp           .end


; 8x32 DCT/DCT inverse transform, public entry point.
; eob == 0 takes the DC-only path at .dconly; anything else calls
; idct_8x32_internal. On 32-bit x86, r5 is first loaded with $$.
; NOTE(review): presumably r5 is the base used by the o() addressing macro,
; which is defined outside this excerpt - confirm.
cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA           r5, $$
%endif
    test          eobd, eobd
    jz            .dconly
    call          m(idct_8x32_internal)
    RET

.dconly:
    ; The first coefficient is multiplied (pmulhrsw) by pw_2896x8, pw_8192,
    ; pw_2896x8 again and pw_2048 (derived from pw_8192 by the shift), then
    ; replicated with pshuflw/punpcklwd. r3d = 8 and the 8x8 add loop is
    ; entered with tx2q pointing at the local .end.
    ; NOTE(review): `mov [coeffq], eobd` presumably clears the coefficient,
    ; since eobd is zero on this path.
    movd          m1, [o(pw_2896x8)]
    pmulhrsw      m0, m1, [coeffq]
    movd          m2, [o(pw_8192)]
    mov           [coeffq], eobd
    pmulhrsw      m0, m2
    psrlw         m2, 2  ;pw_2048
    pmulhrsw      m0, m1
    pmulhrsw      m0, m2
    pshuflw       m0, m0, q0000
    punpcklwd     m0, m0
    mov           r3d, 8
    lea           tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)]
    jmp           m(inv_txfm_add_dct_dct_8x8).loop

.end:
    RET



; 8x32 inverse DCT worker. When eob <= 106 the code jumps straight to .fast
; and skips the groups loaded from coeffq+16*3 and coeffq+16*2 (stride 64).
; Each group goes through idct_8x8_internal.main and
; idct_8x8_internal.pass1_end1 with m7 = pw_8192, then is scattered to the
; stack slots annotated with ";inN".
; NOTE(review): this routine continues beyond the end of this excerpt.
cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp
    cmp           eobd, 106
    jle           .fast

    LOAD_8ROWS    coeffq+16*3, 64
    call          m(idct_8x8_internal).main
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_8x32_internal).pass1)]
    jmp           m(idct_8x8_internal).pass1_end1

.pass1:
    mova          [rsp+gprsize+16*9 ], m0  ;in24
    mova          [rsp+gprsize+16*10], m4  ;in28
    mova          [rsp+gprsize+16*17], m2  ;in26
    mova          [rsp+gprsize+16*18], m6  ;in30
    mova          [rsp+gprsize+16*31], m1  ;in25
    mova          [rsp+gprsize+16*30], m3  ;in27
    mova          [rsp+gprsize+16*27], m5  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31
    LOAD_8ROWS    coeffq+16*2, 64
    call          m(idct_8x8_internal).main
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_8x32_internal).pass1_1)]
    jmp           m(idct_8x8_internal).pass1_end1

.pass1_1:
    mova          [rsp+gprsize+16*7 ], m0  ;in16
    mova          [rsp+gprsize+16*8 ], m4  ;in20
    mova          [rsp+gprsize+16*15], m2  ;in18
    mova          [rsp+gprsize+16*16], m6  ;in22
    mova          [rsp+gprsize+16*33], m1  ;in17
    mova          [rsp+gprsize+16*28], m3  ;in19
    mova          [rsp+gprsize+16*29], m5  ;in21
    mova          [rsp+gprsize+16*32], m7  ;in23

.fast:
    LOAD_8ROWS    coeffq+16*1, 64
    call          m(idct_8x8_internal).main
    mova          m7, [o(pw_8192)]
3456 lea tx2q, [o(m(idct_8x32_internal).pass1_end)] 3457 jmp m(idct_8x8_internal).pass1_end1 3458 3459.pass1_end: 3460 mova [rsp+gprsize+16*5 ], m0 ;in8 3461 mova [rsp+gprsize+16*6 ], m4 ;in12 3462 mova [rsp+gprsize+16*13], m2 ;in10 3463 mova [rsp+gprsize+16*14], m6 ;in14 3464 mova [rsp+gprsize+16*21], m1 ;in9 3465 mova [rsp+gprsize+16*24], m3 ;in11 3466 mova [rsp+gprsize+16*25], m5 ;in13 3467 mova [rsp+gprsize+16*20], m7 ;in15 3468 LOAD_8ROWS coeffq+16*0, 64 3469 call m(idct_8x8_internal).main 3470 mova m7, [o(pw_8192)] 3471 lea tx2q, [o(m(idct_8x32_internal).pass1_end1)] 3472 jmp m(idct_8x8_internal).pass1_end1 3473 3474.pass1_end1: 3475 mova [rsp+gprsize+16*11], m2 ;in2 3476 mova [rsp+gprsize+16*12], m6 ;in6 3477 mova [rsp+gprsize+16*19], m1 ;in1 3478 mova [rsp+gprsize+16*26], m3 ;in3 3479 mova [rsp+gprsize+16*23], m5 ;in5 3480 mova [rsp+gprsize+16*22], m7 ;in7 3481 mova m1, m4 ;in4 3482 mova m2, [rsp+gprsize+16*5 ] ;in8 3483 mova m3, [rsp+gprsize+16*6 ] ;in12 3484 3485 cmp eobd, 106 3486 jg .full 3487 3488 pxor m4, m4 3489 REPX {mova x, m4}, m5, m6, m7 3490 call m(idct_8x8_internal).main 3491 SAVE_7ROWS rsp+gprsize+16*3 , 16 3492 mova m0, [rsp+gprsize+16*11] 3493 mova m1, [rsp+gprsize+16*12] 3494 mova m2, [rsp+gprsize+16*13] 3495 mova m3, [rsp+gprsize+16*14] 3496 pxor m4, m4 3497 REPX {mova x, m4}, m5, m6, m7 3498 call m(idct_16x8_internal).main 3499 mova m7, [rsp+gprsize+16*0] 3500 SAVE_8ROWS rsp+gprsize+16*11, 16 3501 3502 call .main_fast 3503 jmp .pass2 3504 3505.full: 3506 mova m4, [rsp+gprsize+16*7 ] ;in16 3507 mova m5, [rsp+gprsize+16*8 ] ;in20 3508 mova m6, [rsp+gprsize+16*9 ] ;in24 3509 mova m7, [rsp+gprsize+16*10] ;in28 3510 call m(idct_8x8_internal).main 3511 SAVE_7ROWS rsp+gprsize+16*3 , 16 3512 LOAD_8ROWS rsp+gprsize+16*11, 16 3513 call m(idct_16x8_internal).main 3514 mova m7, [rsp+gprsize+16*0] 3515 SAVE_8ROWS rsp+gprsize+16*11, 16 3516 call .main 3517 3518.pass2: 3519 lea r3, [o(m(idct_8x32_internal).end6)] 3520 3521.end: 3522 mova 
[rsp+gprsize+16*0 ], m7 3523 lea tx2q, [o(m(idct_8x32_internal).end2)] 3524 3525.end1: 3526 pxor m7, m7 3527 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ 3528 8, 9, 10, 11, 12, 13, 14, 15, \ 3529 16, 17, 18, 19, 20, 21, 22, 23, \ 3530 24, 25, 26, 27, 28, 29, 30, 31 3531 3532 jmp tx2q 3533 3534.end2: 3535 lea tx2q, [o(m(idct_8x32_internal).end3)] 3536 jmp m(idct_8x8_internal).end 3537 3538.end3: 3539 LOAD_8ROWS rsp+gprsize+16*11, 16 3540 mova [rsp+gprsize+16*0 ], m7 3541 lea dstq, [dstq+strideq*2] 3542 lea tx2q, [o(m(idct_8x32_internal).end4)] 3543 jmp m(idct_8x8_internal).end 3544 3545.end4: 3546 LOAD_8ROWS rsp+gprsize+16*19, 16 3547 mova [rsp+gprsize+16*0 ], m7 3548 lea dstq, [dstq+strideq*2] 3549 lea tx2q, [o(m(idct_8x32_internal).end5)] 3550 jmp m(idct_8x8_internal).end 3551 3552.end5: 3553 LOAD_8ROWS rsp+gprsize+16*27, 16 3554 mova [rsp+gprsize+16*0 ], m7 3555 lea dstq, [dstq+strideq*2] 3556 mov tx2q, r3 3557 jmp m(idct_8x8_internal).end 3558 3559.end6: 3560 ret 3561 3562ALIGN function_align 3563.main_veryfast: 3564 mova m0, [rsp+gprsize*2+16*19] ;in1 3565 pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 3566 pmulhrsw m0, [o(pw_201x8)] ;t16,t17 3567 mova m7, [o(pd_2048)] 3568 mova [rsp+gprsize*2+16*19], m0 ;t16 3569 mova [rsp+gprsize*2+16*34], m3 ;t31 3570 ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a 3571 mova [rsp+gprsize*2+16*20], m3 ;t17a 3572 mova [rsp+gprsize*2+16*33], m0 ;t30a 3573 mova m1, [rsp+gprsize*2+16*22] ;in7 3574 pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 3575 pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 3576 mova [rsp+gprsize*2+16*22], m1 ;t19 3577 mova [rsp+gprsize*2+16*31], m2 ;t28 3578 ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a 3579 mova [rsp+gprsize*2+16*21], m2 ;t18a 3580 mova [rsp+gprsize*2+16*32], m1 ;t29a 3581 mova m0, [rsp+gprsize*2+16*23] ;in5 3582 pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 3583 pmulhrsw m0, [o(pw_995x8)] ;t20, t21 3584 mova [rsp+gprsize*2+16*23], m0 ;t20 3585 mova [rsp+gprsize*2+16*30], m3 ;t27 3586 
ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a 3587 mova [rsp+gprsize*2+16*24], m3 ;t21a 3588 mova [rsp+gprsize*2+16*29], m0 ;t26a 3589 mova m2, [rsp+gprsize*2+16*26] ;in3 3590 pxor m0, m0 3591 mova m3, m0 3592 pmulhrsw m1, m2, [o(pw_4052x8)] 3593 pmulhrsw m2, [o(pw_m601x8)] 3594 jmp .main2 3595 3596ALIGN function_align 3597.main_fast: ;bottom half is zero 3598 mova m0, [rsp+gprsize*2+16*19] ;in1 3599 mova m1, [rsp+gprsize*2+16*20] ;in15 3600 pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a 3601 pmulhrsw m0, [o(pw_201x8)] ;t16a 3602 pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a 3603 pmulhrsw m1, [o(pw_m2751x8)] ;t17a 3604 mova m7, [o(pd_2048)] 3605 psubsw m4, m0, m1 ;t17 3606 paddsw m0, m1 ;t16 3607 psubsw m5, m3, m2 ;t30 3608 paddsw m3, m2 ;t31 3609 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a 3610 mova [rsp+gprsize*2+16*19], m0 ;t16 3611 mova [rsp+gprsize*2+16*20], m5 ;t17a 3612 mova [rsp+gprsize*2+16*33], m4 ;t30a 3613 mova [rsp+gprsize*2+16*34], m3 ;t31 3614 mova m0, [rsp+gprsize*2+16*21] ;in9 3615 mova m1, [rsp+gprsize*2+16*22] ;in7 3616 pmulhrsw m3, m0, [o(pw_3703x8)] 3617 pmulhrsw m0, [o(pw_1751x8)] 3618 pmulhrsw m2, m1, [o(pw_3857x8)] 3619 pmulhrsw m1, [o(pw_m1380x8)] 3620 psubsw m4, m1, m0 ;t18 3621 paddsw m0, m1 ;t19 3622 psubsw m5, m2, m3 ;t29 3623 paddsw m3, m2 ;t28 3624 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a 3625 mova [rsp+gprsize*2+16*21], m5 ;t18a 3626 mova [rsp+gprsize*2+16*22], m0 ;t19 3627 mova [rsp+gprsize*2+16*31], m3 ;t28 3628 mova [rsp+gprsize*2+16*32], m4 ;t29a 3629 mova m0, [rsp+gprsize*2+16*23] ;in5 3630 mova m1, [rsp+gprsize*2+16*24] ;in11 3631 pmulhrsw m3, m0, [o(pw_3973x8)] 3632 pmulhrsw m0, [o(pw_995x8)] 3633 pmulhrsw m2, m1, [o(pw_3513x8)] 3634 pmulhrsw m1, [o(pw_m2106x8)] 3635 psubsw m4, m0, m1 ;t21 3636 paddsw m0, m1 ;t20 3637 psubsw m5, m3, m2 ;t26 3638 paddsw m3, m2 ;t27 3639 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a 3640 mova [rsp+gprsize*2+16*23], m0 ;t20 3641 mova [rsp+gprsize*2+16*24], m5 ;t21a 3642 mova 
[rsp+gprsize*2+16*29], m4 ;t26a 3643 mova [rsp+gprsize*2+16*30], m3 ;t27 3644 mova m0, [rsp+gprsize*2+16*25] ;in13 3645 mova m2, [rsp+gprsize*2+16*26] ;in3 3646 pmulhrsw m3, m0, [o(pw_3290x8)] 3647 pmulhrsw m0, [o(pw_2440x8)] 3648 pmulhrsw m1, m2, [o(pw_4052x8)] 3649 pmulhrsw m2, [o(pw_m601x8)] 3650 jmp .main2 3651 3652ALIGN function_align 3653.main: 3654 mova m7, [o(pd_2048)] 3655 mova m0, [rsp+gprsize*2+16*19] ;in1 3656 mova m1, [rsp+gprsize*2+16*20] ;in15 3657 mova m2, [rsp+gprsize*2+16*33] ;in17 3658 mova m3, [rsp+gprsize*2+16*34] ;in31 3659 ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a 3660 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a 3661 psubsw m4, m0, m2 ;t17 3662 paddsw m0, m2 ;t16 3663 psubsw m5, m3, m1 ;t30 3664 paddsw m3, m1 ;t31 3665 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a 3666 mova [rsp+gprsize*2+16*19], m0 ;t16 3667 mova [rsp+gprsize*2+16*20], m5 ;t17a 3668 mova [rsp+gprsize*2+16*33], m4 ;t30a 3669 mova [rsp+gprsize*2+16*34], m3 ;t31 3670 mova m0, [rsp+gprsize*2+16*21] ;in9 3671 mova m1, [rsp+gprsize*2+16*22] ;in7 3672 mova m2, [rsp+gprsize*2+16*31] ;in25 3673 mova m3, [rsp+gprsize*2+16*32] ;in23 3674 ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a 3675 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a 3676 psubsw m4, m2, m0 ;t18 3677 paddsw m0, m2 ;t19 3678 psubsw m5, m1, m3 ;t29 3679 paddsw m3, m1 ;t28 3680 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a 3681 mova [rsp+gprsize*2+16*21], m5 ;t18a 3682 mova [rsp+gprsize*2+16*22], m0 ;t19 3683 mova [rsp+gprsize*2+16*31], m3 ;t28 3684 mova [rsp+gprsize*2+16*32], m4 ;t29a 3685 mova m0, [rsp+gprsize*2+16*23] ;in5 3686 mova m1, [rsp+gprsize*2+16*24] ;in11 3687 mova m2, [rsp+gprsize*2+16*29] ;in21 3688 mova m3, [rsp+gprsize*2+16*30] ;in27 3689 ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a 3690 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a 3691 psubsw m4, m0, m2 ;t21 3692 paddsw m0, m2 ;t20 3693 psubsw m5, m3, m1 ;t26 3694 paddsw m3, m1 ;t27 3695 ITX_MULSUB_2W 5, 
4, 1, 2, 7, 3406, 2276 ;t21a, t26a 3696 mova [rsp+gprsize*2+16*23], m0 ;t20 3697 mova [rsp+gprsize*2+16*24], m5 ;t21a 3698 mova [rsp+gprsize*2+16*29], m4 ;t26a 3699 mova [rsp+gprsize*2+16*30], m3 ;t27 3700 mova m0, [rsp+gprsize*2+16*25] ;in13 3701 mova m1, [rsp+gprsize*2+16*26] ;in3 3702 mova m2, [rsp+gprsize*2+16*27] ;in29 3703 mova m3, [rsp+gprsize*2+16*28] ;in19 3704 ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a 3705 ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a 3706 3707.main2: 3708 psubsw m4, m2, m0 ;t22 3709 paddsw m0, m2 ;t23 3710 psubsw m5, m1, m3 ;t25 3711 paddsw m3, m1 ;t24 3712 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a 3713 mova m2, [rsp+gprsize*2+16*24] ;t21a 3714 psubsw m1, m5, m2 ;t21 3715 paddsw m5, m2 ;t22 3716 mova [rsp+gprsize*2+16*25], m5 ;t22 3717 mova m2, [rsp+gprsize*2+16*29] ;t26a 3718 psubsw m5, m4, m2 ;t26 3719 paddsw m4, m2 ;t25 3720 mova [rsp+gprsize*2+16*28], m4 ;t25 3721 ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a 3722 mova [rsp+gprsize*2+16*24], m5 ;t21a 3723 mova [rsp+gprsize*2+16*29], m1 ;t26a 3724 3725 mova m1, [rsp+gprsize*2+16*23] ;t20 3726 mova m5, [rsp+gprsize*2+16*30] ;t27 3727 psubsw m2, m0, m1 ;t20a 3728 paddsw m0, m1 ;t23a 3729 psubsw m6, m3, m5 ;t27a 3730 paddsw m3, m5 ;t24a 3731 ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 3732 mova [rsp+gprsize*2+16*26], m0 ;t23a 3733 mova [rsp+gprsize*2+16*27], m3 ;t24a 3734 mova [rsp+gprsize*2+16*30], m2 ;t27 3735 3736 mova m0, [rsp+gprsize*2+16*20] ;t17a 3737 mova m1, [rsp+gprsize*2+16*21] ;t18a 3738 mova m2, [rsp+gprsize*2+16*32] ;t29a 3739 mova m3, [rsp+gprsize*2+16*33] ;t30a 3740 psubsw m4, m0, m1 ;t18 3741 paddsw m0, m1 ;t17 3742 psubsw m5, m3, m2 ;t29 3743 paddsw m3, m2 ;t30 3744 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a 3745 mova [rsp+gprsize*2+16*20], m0 ;t17 3746 mova [rsp+gprsize*2+16*21], m5 ;t18a 3747 mova [rsp+gprsize*2+16*32], m4 ;t29a 3748 mova [rsp+gprsize*2+16*33], m3 ;t30 3749 mova m0, [rsp+gprsize*2+16*19] ;t16 3750 mova 
m1, [rsp+gprsize*2+16*22] ;t19 3751 mova m2, [rsp+gprsize*2+16*31] ;t28 3752 mova m3, [rsp+gprsize*2+16*34] ;t31 3753 psubsw m4, m0, m1 ;t19a 3754 paddsw m0, m1 ;t16a 3755 psubsw m5, m3, m2 ;t28a 3756 paddsw m3, m2 ;t31a 3757 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 3758 mova m2, [rsp+gprsize*2+16*15] ;tmp12 3759 psubsw m1, m5, m6 ;t20a 3760 paddsw m5, m6 ;t19a 3761 psubsw m6, m2, m5 ;out19 3762 paddsw m2, m5 ;out12 3763 mova m5, [rsp+gprsize*2+16*30] ;t27 3764 mova [rsp+gprsize*2+16*22], m6 ;out19 3765 mova [rsp+gprsize*2+16*15], m2 ;out12 3766 psubsw m6, m4, m5 ;t27a 3767 paddsw m4, m5 ;t28a 3768 ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 3769 mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 3770 psubsw m5, m2, m4 ;out28 3771 paddsw m2, m4 ;out3 3772 mova m4, [rsp+gprsize*2+16*14] ;tmp11 3773 mova [rsp+gprsize*2+16*31], m5 ;out28 3774 mova [rsp+gprsize*2+16*6 ], m2 ;out3 3775 psubsw m5, m4, m6 ;out20 3776 paddsw m4, m6 ;out11 3777 mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 3778 mova [rsp+gprsize*2+16*23], m5 ;out20 3779 mova [rsp+gprsize*2+16*14], m4 ;out11 3780 psubsw m5, m2, m1 ;out27 3781 paddsw m2, m1 ;out4 3782 mova m1, [rsp+gprsize*2+16*26] ;t23a 3783 mova m4, [rsp+gprsize*2+16*27] ;t24a 3784 mova [rsp+gprsize*2+16*30], m5 ;out27 3785 mova [rsp+gprsize*2+16*7 ], m2 ;out4 3786 psubsw m5, m0, m1 ;t23 3787 paddsw m0, m1 ;t16 3788 psubsw m2, m3, m4 ;t24 3789 paddsw m3, m4 ;t31 3790 ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a 3791 mova m6, [rsp+gprsize*2+16*18] ;tmp15 3792 psubsw m4, m6, m0 ;out16 3793 paddsw m6, m0 ;out15 3794 mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 3795 mova m1, [rsp+gprsize*2+16*11] ;tmp8 3796 mova [rsp+gprsize*2+16*18], m6 ;out15 3797 mova [rsp+gprsize*2+16*19], m4 ;out16 3798 psubsw m6, m0, m3 ;out31 3799 paddsw m0, m3 ;out0 3800 psubsw m4, m1, m2 ;out23 3801 paddsw m1, m2 ;out8 3802 mova m3, [rsp+gprsize*2+16*10] ;tmp7 3803 mova [rsp+gprsize*2+16*34], m6 ;out31 3804 mova [rsp+gprsize*2+16*11], m1 ;out8 3805 mova [rsp+gprsize*2+16*26], 
m4 ;out23 3806 paddsw m6, m3, m5 ;out7 3807 psubsw m3, m5 ;out24 3808 mova m1, [rsp+gprsize*2+16*20] ;t17 3809 mova m5, [rsp+gprsize*2+16*25] ;t22 3810 mova m2, [rsp+gprsize*2+16*17] ;tmp14 3811 mova [rsp+gprsize*2+16*27], m3 ;out24 3812 psubsw m4, m1, m5 ;t22a 3813 paddsw m1, m5 ;t17a 3814 psubsw m3, m2, m1 ;out17 3815 paddsw m2, m1 ;out14 3816 mova m5, [rsp+gprsize*2+16*28] ;t25 3817 mova m1, [rsp+gprsize*2+16*33] ;t30 3818 mova [rsp+gprsize*2+16*17], m2 ;out14 3819 mova [rsp+gprsize*2+16*20], m3 ;out17 3820 psubsw m2, m1, m5 ;t25a 3821 paddsw m1, m5 ;t30a 3822 ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 3823 mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 3824 psubsw m3, m5, m1 ;out30 3825 paddsw m5, m1 ;out1 3826 mova m1, [rsp+gprsize*2+16*12] ;tmp9 3827 mova [rsp+gprsize*2+16*33], m3 ;out30 3828 mova [rsp+gprsize*2+16*4 ], m5 ;out1 3829 psubsw m3, m1, m2 ;out22 3830 paddsw m1, m2 ;out9 3831 mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 3832 mova [rsp+gprsize*2+16*25], m3 ;out22 3833 mova [rsp+gprsize*2+16*12], m1 ;out9 3834 psubsw m3, m5, m4 ;out25 3835 paddsw m5, m4 ;out6 3836 mova m4, [rsp+gprsize*2+16*21] ;t18a 3837 mova m1, [rsp+gprsize*2+16*24] ;t21a 3838 mova m2, [rsp+gprsize*2+16*16] ;tmp13 3839 mova [rsp+gprsize*2+16*28], m3 ;out25 3840 mova [rsp+gprsize*2+16*9 ], m5 ;out6 3841 paddsw m3, m4, m1 ;t18 3842 psubsw m4, m1 ;t21 3843 psubsw m5, m2, m3 ;out18 3844 paddsw m2, m3 ;out13 3845 mova m1, [rsp+gprsize*2+16*29] ;t26a 3846 mova m3, [rsp+gprsize*2+16*32] ;t29a 3847 mova [rsp+gprsize*2+16*21], m5 ;out18 3848 mova [rsp+gprsize*2+16*16], m2 ;out13 3849 psubsw m5, m3, m1 ;t26 3850 paddsw m3, m1 ;t29 3851 ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a 3852 mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 3853 psubsw m1, m2, m3 ;out29 3854 paddsw m2, m3 ;out2 3855 mova m3, [rsp+gprsize*2+16*13] ;tmp10 3856 mova [rsp+gprsize*2+16*32], m1 ;out29 3857 psubsw m7, m3, m5 ;out21 3858 paddsw m3, m5 ;out10 3859 mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 3860 mova [rsp+gprsize*2+16*24], m7 
;out21
    mova          [rsp+gprsize*2+16*13], m3  ;out10
    psubsw        m1, m5, m4  ;out26
    paddsw        m5, m4  ;out5
    mova          m7, m6  ;out7
    mova          m3, [rsp+gprsize*2+16*6 ]  ;out3
    mova          m4, [rsp+gprsize*2+16*7 ]  ;out4
    mova          [rsp+gprsize*2+16*29], m1  ;out26
    mova          m6, [rsp+gprsize*2+16*9 ]  ;out6
    mova          m1, [rsp+gprsize*2+16*4 ]  ;out1
    ret


; 32x8 DCT_DCT inverse transform + add, public entry point.
;   eob == 0 -> .dconly (only the first coefficient is read)
;   eob != 0 -> idct_32x8_internal does the whole job
; .body is shared: the 32x16 and 32x32 entry points jump to it with their own
; m0/m1/m2 set-up, row count in r3d and continuation address in tx2q.
cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA           r5, $$
%endif
    test          eobd, eobd
    jz .dconly
    call m(idct_32x8_internal)
    RET

.dconly:
    movd          m1, [o(pw_2896x8)]
    pmulhrsw      m0, m1, [coeffq]
    movd          m2, [o(pw_8192)]
    mov           [coeffq], eobd  ; eobd is 0 on this path (jz above): clears the coefficient
    mov           r3d, 8  ; number of rows written by .loop
    lea           tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]

.body:
    pmulhrsw      m0, m2
    movd          m2, [o(pw_2048)]  ;intentionally rip-relative
    pmulhrsw      m0, m1
    pmulhrsw      m0, m2
    pshuflw       m0, m0, q0000  ; replicate word 0 across the low four words
    punpcklwd     m0, m0  ; ... and then across the whole register
    pxor          m5, m5  ; zero, used to widen bytes to words

; One row of 32 pixels per iteration: widen to words, add m0, pack back with
; unsigned saturation, step dstq by one stride.
.loop:
    mova          m1, [dstq+16*0]
    mova          m3, [dstq+16*1]
    punpckhbw     m2, m1, m5
    punpcklbw     m1, m5
    punpckhbw     m4, m3, m5
    punpcklbw     m3, m5
    paddw         m2, m0
    paddw         m1, m0
    paddw         m4, m0
    paddw         m3, m0
    packuswb      m1, m2
    packuswb      m3, m4
    mova          [dstq+16*0], m1
    mova          [dstq+16*1], m3
    add           dstq, strideq
    dec           r3d
    jg .loop
    jmp tx2q

.end:
    RET


; 32x8 inverse DCT worker (reached with `call`, hence the +gprsize on every
; stack offset).
; Pass 1: feeds three interleaved row groups of coeffq to idct_8x8_internal.main,
; idct_16x8_internal.main and the 32-point helpers of idct_8x32_internal.
; The odd inputs are parked in the stack slots those helpers read (see the inN
; annotations). eob <= 106 skips loading in17..in31 and uses .main_fast.
; .pass2 .. .end8: a chain of continuations threaded through tx2q that pushes
; each group of 8 rows (stack slots 16*3, 16*11, 16*19, 16*27) through
; idct_8x8_internal.pass1_end1 / .pass2_main, advancing the destination by
; 8 bytes per group via r3.
cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp
    LOAD_8ROWS    coeffq+16*0, 64
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    LOAD_8ROWS    coeffq+16*2, 64
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    LOAD_8ROWS    coeffq+16*1, 32
    mova          [rsp+gprsize+16*19], m0  ;in1
    mova          [rsp+gprsize+16*26], m1  ;in3
    mova          [rsp+gprsize+16*23], m2  ;in5
    mova          [rsp+gprsize+16*22], m3  ;in7
    mova          [rsp+gprsize+16*21], m4  ;in9
    mova          [rsp+gprsize+16*24], m5  ;in11
    mova          [rsp+gprsize+16*25], m6  ;in13
    mova          [rsp+gprsize+16*20], m7  ;in15

    cmp           eobd, 106
    jg .full
    call m(idct_8x32_internal).main_fast
    jmp .pass2

.full:
    LOAD_8ROWS    coeffq+16*17, 32
    mova          [rsp+gprsize+16*33], m0  ;in17
    mova          [rsp+gprsize+16*28], m1  ;in19
    mova          [rsp+gprsize+16*29], m2  ;in21
    mova          [rsp+gprsize+16*32], m3  ;in23
    mova          [rsp+gprsize+16*31], m4  ;in25
    mova          [rsp+gprsize+16*30], m5  ;in27
    mova          [rsp+gprsize+16*27], m6  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31
    call m(idct_8x32_internal).main

.pass2:
    mova          [rsp+gprsize+16*0 ], m7
    lea           tx2q, [o(m(idct_32x8_internal).end)]
    jmp m(idct_8x32_internal).end1  ; zeroes coeffq+16*0..31, then jumps to tx2q

.end:
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x8_internal).end1)]
    jmp m(idct_8x8_internal).pass1_end1

.end1:
    lea           r3, [dstq+8]  ; destination of the next 8-pixel column group
    lea           tx2q, [o(m(idct_32x8_internal).end2)]
    jmp m(idct_8x8_internal).pass2_main

.end2:
    LOAD_8ROWS    rsp+gprsize+16*11, 16
    mova          [rsp+gprsize+16*0 ], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x8_internal).end3)]
    jmp m(idct_8x8_internal).pass1_end1

.end3:
    mov           dstq, r3
    add           r3, 8
    lea           tx2q, [o(m(idct_32x8_internal).end4)]
    jmp m(idct_8x8_internal).pass2_main

.end4:
    LOAD_8ROWS    rsp+gprsize+16*19, 16
    mova          [rsp+gprsize+16*0 ], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x8_internal).end5)]
    jmp m(idct_8x8_internal).pass1_end1

.end5:
    mov           dstq, r3
    add           r3, 8
    lea           tx2q, [o(m(idct_32x8_internal).end6)]
    jmp m(idct_8x8_internal).pass2_main

.end6:
    LOAD_8ROWS    rsp+gprsize+16*27, 16
    mova          [rsp+gprsize+16*0 ], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x8_internal).end7)]
    jmp m(idct_8x8_internal).pass1_end1

.end7:
    mov           dstq, r3
    lea           tx2q, [o(m(idct_32x8_internal).end8)]
    jmp m(idct_8x8_internal).pass2_main

.end8:
; Body of idct_32x8_internal.end8 (label on the preceding line): plain return.
; The identity routines below also load this address into tx2q.
    ret


; 8x32 IDENTITY_IDENTITY inverse transform + add.
; Processes the block as 8x8 tiles, top to bottom:
;   r3d = tile count: 2, or 4 when eob >= 107 (cmp/cmovns below).
; Per tile: load 8 rows (stride 64) from coeffq, add pw_5 to every row,
; run idct_8x8_internal.pass1_end3, arithmetic-shift right by 3, hand the rows
; to idct_8x8_internal.end3, then clear the 8 coefficient rows just consumed.
; NOTE(review): pass1_end3/end3 are defined outside this chunk; they are
; presumed to finish with `jmp tx2q`, which lands on the `ret` above.
cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov           r5d, 4
    mov           tx2d, 2
    cmp           eobd, 107
    cmovns        tx2d, r5d  ; tx2d is only a scratch register here
    mov           r3d, tx2d
%if ARCH_X86_32
    LEA           r5, $$
%endif
    lea           tx2q, [o(m(idct_32x8_internal).end8)]
.loop:
    LOAD_8ROWS    coeffq+16*0, 64
    paddsw        m6, [o(pw_5)]  ; m6 gets the bias from memory ...
    mova          [rsp+16*1], m6  ; ... and is spilled so it can hold the constant
    mova          m6, [o(pw_5)]
    REPX          {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    call m(idct_8x8_internal).pass1_end3
    REPX          {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    mova          [rsp+16*2], m5
    mova          [rsp+16*1], m6
    mova          [rsp+16*0], m7
    call m(idct_8x8_internal).end3
    lea           dstq, [dstq+strideq*2]
    pxor          m7, m7
    REPX          {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    add           coeffq, 16
    dec           r3d
    jg .loop
    RET

; 32x8 IDENTITY_IDENTITY inverse transform + add.
; Processes the block as 8x8 tiles, left to right:
;   r3d = tile count: 2, or 4 when eob >= 107.
; Per tile: load 8 consecutive 16-byte rows from coeffq, scale by pw_4096
; (pmulhrsw), run idct_8x8_internal.pass1_end3, then idct_8x8_internal.end3
; with tx2q = idct_8x8_internal.end4. dstq is saved around the call and then
; moved 8 bytes to the right; coeffq advances by 16*8.
;
; The loop is governed by the r3d counter alone. An extra `jnc .loop` used to
; follow `jg .loop`; `dec` does not write CF, so CF there was the carry-out of
; `add coeffq, 16*8` (clear for any valid pointer) and the branch re-entered
; the loop after the counter had run out. It has been removed so the loop
; terminates exactly like the 8x32 variant above.
cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov           r5d, 4
    mov           tx2d, 2
    cmp           eobd, 107
    cmovns        tx2d, r5d
    mov           r3d, tx2d
%if ARCH_X86_32
    LEA           r5, $$
%endif

.loop:
    LOAD_8ROWS    coeffq+16*0, 16
    pmulhrsw      m6, [o(pw_4096)]
    mova          [rsp+16*1], m6
    mova          m6, [o(pw_4096)]
    REPX          {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    lea           tx2q, [o(m(idct_32x8_internal).end8)]
    call m(idct_8x8_internal).pass1_end3

    mov           [rsp+16*3], dstq  ; keep the tile origin across end3
    mova          [rsp+16*2], m5
    mova          [rsp+16*1], m6
    mova          [rsp+16*0], m7
    lea           tx2q, [o(m(idct_8x8_internal).end4)]
    call m(idct_8x8_internal).end3

    add           coeffq, 16*8
    mov           dstq, [rsp+16*3]
    lea           dstq, [dstq+8]
    dec           r3d
    jg .loop
    RET


; 16x32 DCT_DCT inverse transform + add, public entry point.
;   eob == 0 -> .dconly, otherwise idct_16x32_internal.
cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA           r5, $$
%endif
    test          eobd, eobd
    jz .dconly
    call m(idct_16x32_internal)
    RET

.dconly:
    movd          m1, [o(pw_2896x8)]
    pmulhrsw      m0, m1, [coeffq]
    movd          m2, [o(pw_16384)]
; Continuation of inv_txfm_add_dct_dct_16x32.dconly: eobd is 0 on this path,
; so the store clears the coefficient; the add loop itself lives in the 16x4
; routine, which returns through tx2q.
    mov           [coeffq], eobd
    pmulhrsw      m0, m1
    mov           r2d, 16
    lea           tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)]
    jmp m(inv_txfm_add_dct_dct_16x4).dconly

.end:
    RET

; 16x32 inverse DCT worker (reached with `call`).
; Pass 1 runs 16-point transforms on groups of coefficient rows taken from
; coeffq+16*{1,5}, +16*{0,4} and, only when eob > 150, +16*{2,6}, +16*{3,7}.
; For every group one half of the result is written to coeffq+16*32.. for
; later, the other half is parked in the registers / stack slots that the
; 32-point helpers of idct_8x32_internal read (see the inN annotations).
; .pass2 saves eob and dstq+8 and finishes the first 8 columns through
; idct_8x32_internal.end, which comes back to .end via r3.
; .end rebuilds the same layout from coeffq+16*32.. for the remaining columns
; and tail-jumps into idct_8x32_internal.pass2.
cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    LOAD_8ROWS    coeffq+16*1, 128, 1
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+16*5, 128, 1
    call m(idct_16x8_internal).main
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end:
    SAVE_8ROWS    coeffq+16*33, 64  ;in8~in15
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end1)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end1:
    mova          [coeffq+16*1 ], m0  ;in8
    mova          [coeffq+16*5 ], m4  ;in12
    mova          [rsp+gprsize+16*13], m2  ;in10
    mova          [rsp+gprsize+16*14], m6  ;in14
    mova          [rsp+gprsize+16*21], m1  ;in9
    mova          [rsp+gprsize+16*24], m3  ;in11
    mova          [rsp+gprsize+16*25], m5  ;in13
    mova          [rsp+gprsize+16*20], m7  ;in15
    LOAD_8ROWS    coeffq+16*0, 128, 1
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+16*4, 128, 1
    call m(idct_16x8_internal).main
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end2)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end2:
    SAVE_8ROWS    coeffq+16*32, 64  ;in0~in7
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end3)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end3:
    mova          [rsp+gprsize+16*11], m2  ;in2
    mova          [rsp+gprsize+16*12], m6  ;in6
    mova          [rsp+gprsize+16*19], m1  ;in1
    mova          [rsp+gprsize+16*26], m3  ;in3
    mova          [rsp+gprsize+16*23], m5  ;in5
    mova          [rsp+gprsize+16*22], m7  ;in7

    cmp           eobd, 150
    jg .full

    ; eob <= 150: in16..in31 are treated as zero (m0 still holds in0)
    mova          m1, m4  ;in4
    mova          m2, [coeffq+16*1 ]  ;in8
    mova          m3, [coeffq+16*5 ]  ;in12
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    mova          m0, [rsp+gprsize+16*11]  ;in2
    mova          m1, [rsp+gprsize+16*12]  ;in6
    mova          m2, [rsp+gprsize+16*13]  ;in10
    mova          m3, [rsp+gprsize+16*14]  ;in14
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    call m(idct_8x32_internal).main_fast
    jmp .pass2

.full:
    ; in0/in4 are stashed in coefficient slots already consumed by pass 1
    mova          [coeffq+16*0 ], m0  ;in0
    mova          [coeffq+16*4 ], m4  ;in4

    LOAD_8ROWS    coeffq+16*2, 128, 1
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+16*6, 128, 1
    call m(idct_16x8_internal).main
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end4)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end4:
    SAVE_8ROWS    coeffq+16*34, 64  ;in16~in23
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end5)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end5:
    mova          [coeffq+16*2 ], m0  ;in16
    mova          [coeffq+16*6 ], m4  ;in20
    mova          [rsp+gprsize+16*15], m2  ;in18
    mova          [rsp+gprsize+16*16], m6  ;in22
    mova          [rsp+gprsize+16*33], m1  ;in17
    mova          [rsp+gprsize+16*28], m3  ;in19
    mova          [rsp+gprsize+16*29], m5  ;in21
    mova          [rsp+gprsize+16*32], m7  ;in23

    LOAD_8ROWS    coeffq+16*3, 128, 1
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+16*7, 128, 1
    call m(idct_16x8_internal).main
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end6)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end6:
    SAVE_8ROWS    coeffq+16*35, 64  ;in24~in31
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova          [rsp+gprsize+16*0], m7
    lea           tx2q, [o(m(idct_16x32_internal).pass1_end7)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end7:
    mova          [rsp+gprsize+16*17], m2  ;in26
    mova          [rsp+gprsize+16*18], m6  ;in30
    mova          [rsp+gprsize+16*31], m1  ;in25
    mova          [rsp+gprsize+16*30], m3  ;in27
    mova          [rsp+gprsize+16*27], m5  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31

    mova          m6, m0  ;in24
    mova          m7, m4  ;in28
    mova          m0, [coeffq+16*0 ]  ;in0
    mova          m1, [coeffq+16*4 ]  ;in4
    mova          m2, [coeffq+16*1 ]  ;in8
    mova          m3, [coeffq+16*5 ]  ;in12
    mova          m4, [coeffq+16*2 ]  ;in16
    mova          m5, [coeffq+16*6 ]  ;in20
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3 , 16
    LOAD_8ROWS    rsp+gprsize+16*11, 16
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    call m(idct_8x32_internal).main

.pass2:
    mov           [rsp+gprsize*1+16*35], eobd  ; eob is needed again in .end
    lea           r3, [dstq+8]
    mov           [rsp+gprsize*2+16*35], r3  ; destination of the second 8 columns
    lea           r3, [o(m(idct_16x32_internal).end)]  ; continuation used by idct_8x32_internal
    jmp m(idct_8x32_internal).end

.end:
    mov           dstq, [rsp+gprsize*2+16*35]
    mov           eobd, [rsp+gprsize*1+16*35]
    add           coeffq, 16*32  ; rows saved by pass 1 start here

    mova          m0, [coeffq+16*4 ]  ;in1
    mova          m1, [coeffq+16*12]  ;in3
    mova          m2, [coeffq+16*20]  ;in5
    mova          m3, [coeffq+16*28]  ;in7
    mova          m4, [coeffq+16*5 ]  ;in9
    mova          m5, [coeffq+16*13]  ;in11
    mova          m6, [coeffq+16*21]  ;in13
    mova          m7, [coeffq+16*29]  ;in15

    mova          [rsp+gprsize+16*19], m0  ;in1
    mova          [rsp+gprsize+16*26], m1  ;in3
    mova          [rsp+gprsize+16*23], m2  ;in5
    mova          [rsp+gprsize+16*22], m3  ;in7
    mova          [rsp+gprsize+16*21], m4  ;in9
    mova          [rsp+gprsize+16*24], m5  ;in11
    mova          [rsp+gprsize+16*25], m6  ;in13
    mova          [rsp+gprsize+16*20], m7  ;in15

    mova          m0, [coeffq+16*0 ]  ;in0
    mova          m1, [coeffq+16*16]  ;in4
    mova          m2, [coeffq+16*1 ]  ;in8
    mova          m3, [coeffq+16*17]  ;in12

    cmp           eobd, 150
    jg .full1

    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova          m0, [coeffq+16*8 ]  ;in2
    mova          m1, [coeffq+16*24]  ;in6
    mova          m2, [coeffq+16*9 ]  ;in10
    mova          m3, [coeffq+16*25]  ;in14
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    call m(idct_8x32_internal).main_fast
    jmp .end1

.full1:
    mova          m4, [coeffq+16*2 ]  ;in16
    mova          m5, [coeffq+16*18]  ;in20
    mova          m6, [coeffq+16*3 ]  ;in24
    mova          m7, [coeffq+16*19]  ;in28
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova          m0, [coeffq+16*8 ]  ;in2
    mova          m1, [coeffq+16*24]  ;in6
    mova          m2, [coeffq+16*9 ]  ;in10
    mova          m3, [coeffq+16*25]  ;in14
    mova          m4, [coeffq+16*10]  ;in18
    mova          m5, [coeffq+16*26]  ;in22
    mova          m6, [coeffq+16*11]  ;in26
    mova          m7, [coeffq+16*27]  ;in30
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    mova          m0, [coeffq+16*6 ]  ;in17
    mova          m1, [coeffq+16*14]  ;in19
    mova          m2, [coeffq+16*22]  ;in21
    mova          m3, [coeffq+16*30]  ;in23
    mova          m4, [coeffq+16*7 ]  ;in25
    mova          m5, [coeffq+16*15]  ;in27
    mova          m6, [coeffq+16*23]  ;in29
    mova          m7, [coeffq+16*31]  ;in31

    mova          [rsp+gprsize+16*33], m0  ;in17
    mova          [rsp+gprsize+16*28], m1  ;in19
    mova          [rsp+gprsize+16*29], m2  ;in21
    mova          [rsp+gprsize+16*32], m3  ;in23
    mova          [rsp+gprsize+16*31], m4  ;in25
    mova          [rsp+gprsize+16*30], m5  ;in27
    mova          [rsp+gprsize+16*27], m6  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31

    call m(idct_8x32_internal).main

.end1:
    jmp m(idct_8x32_internal).pass2



; 32x16 DCT_DCT inverse transform + add, public entry point.
;   eob == 0 -> .dconly, otherwise idct_32x16_internal followed by one
;   idct_8x16_internal.pass2 call per group of 8 columns.
cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA           r5, $$
%endif
    test          eobd, eobd
    jz .dconly

    call m(idct_32x16_internal)
    call m(idct_8x16_internal).pass2

    add           coeffq, 16*16
    lea           dstq, [r3+8]
    LOAD_8ROWS    rsp+16*11, 16
; Continuation of inv_txfm_add_dct_dct_32x16: the remaining column groups.
; Each group reloads 8 rows that idct_32x16_internal left on the stack
; (slots 16*11, 16*19, 16*27), runs idct_8x8_internal.pass1_end with
; tx2q pointing at a bare `ret`, then idct_8x16_internal.pass2.
    mova          [rsp+16*0], m7
    lea           tx2q, [o(m(idct_32x16_internal).end)]
    call m(idct_8x8_internal).pass1_end
    call m(idct_8x16_internal).pass2

    add           coeffq, 16*16
    lea           dstq, [r3+8]
    LOAD_8ROWS    rsp+16*19, 16
    mova          [rsp+16*0], m7
    lea           tx2q, [o(m(idct_32x16_internal).end)]
    call m(idct_8x8_internal).pass1_end
    call m(idct_8x16_internal).pass2

    add           coeffq, 16*16
    lea           dstq, [r3+8]
    LOAD_8ROWS    rsp+16*27, 16
    mova          [rsp+16*0], m7
    lea           tx2q, [o(m(idct_32x16_internal).end)]
    call m(idct_8x8_internal).pass1_end
    call m(idct_8x16_internal).pass2
    RET

.dconly:
    movd          m1, [o(pw_2896x8)]
    pmulhrsw      m0, m1, [coeffq]
    movd          m2, [o(pw_16384)]
    mov           [coeffq], eobd  ; eobd is 0 on this path: clears the coefficient
    pmulhrsw      m0, m1
    mov           r3d, 16  ; row count for the shared 32-wide add loop
    lea           tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
    jmp m(inv_txfm_add_dct_dct_32x8).body


; 32x16 inverse DCT, pass 1 (reached with `call`).
; .pass1 is executed twice. The first time coeffq is offset by 16 and r3 holds
; .pass1_end1, so the four groups of 8 rows are written back to
; coeffq+16*{0,16,32,48}. Then coeffq is restored, r3 is set to .end and
; .pass1 runs again; this time .pass1_end hands control to the bare `ret` at
; .end with the remaining groups still in stack slots 16*11/19/27.
cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    add           coeffq, 16
    lea           r3, [o(m(idct_32x16_internal).pass1_end1)]
.pass1:
    LOAD_8ROWS    coeffq+16*0, 128, 1
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    LOAD_8ROWS    coeffq+16*4, 128, 1
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    LOAD_8ROWS    coeffq+16*2, 64, 1
    mova          [rsp+gprsize+16*19], m0  ;in1
    mova          [rsp+gprsize+16*26], m1  ;in3
    mova          [rsp+gprsize+16*23], m2  ;in5
    mova          [rsp+gprsize+16*22], m3  ;in7
    mova          [rsp+gprsize+16*21], m4  ;in9
    mova          [rsp+gprsize+16*24], m5  ;in11
    mova          [rsp+gprsize+16*25], m6  ;in13
    mova          [rsp+gprsize+16*20], m7  ;in15

    LOAD_8ROWS    coeffq+16*34, 64, 1
    mova          [rsp+gprsize+16*33], m0  ;in17
    mova          [rsp+gprsize+16*28], m1  ;in19
    mova          [rsp+gprsize+16*29], m2  ;in21
    mova          [rsp+gprsize+16*32], m3  ;in23
    mova          [rsp+gprsize+16*31], m4  ;in25
    mova          [rsp+gprsize+16*30], m5  ;in27
    mova          [rsp+gprsize+16*27], m6  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31
    call m(idct_8x32_internal).main

.pass1_end:
    mova          [rsp+gprsize+16*0 ], m7
    mov           tx2q, r3  ; .pass1_end1 on the first run, .end on the second
    jmp m(idct_8x8_internal).pass1_end

.pass1_end1:
    SAVE_8ROWS    coeffq+16*0, 32
    LOAD_8ROWS    rsp+gprsize+16*11, 16
    mova          [rsp+gprsize+16*0 ], m7
    lea           tx2q, [o(m(idct_32x16_internal).pass1_end2)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end2:
    SAVE_8ROWS    coeffq+16*16, 32
    LOAD_8ROWS    rsp+gprsize+16*19, 16
    mova          [rsp+gprsize+16*0 ], m7
    lea           tx2q, [o(m(idct_32x16_internal).pass1_end3)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end3:
    SAVE_8ROWS    coeffq+16*32, 32
    LOAD_8ROWS    rsp+gprsize+16*27, 16
    mova          [rsp+gprsize+16*0 ], m7
    lea           tx2q, [o(m(idct_32x16_internal).pass1_end4)]
    jmp m(idct_8x8_internal).pass1_end

.pass1_end4:
    SAVE_8ROWS    coeffq+16*48, 32

    sub           coeffq, 16
    lea           r3, [o(m(idct_32x16_internal).end)]
    jmp .pass1

.end:
    ret


; 16x32 IDENTITY_IDENTITY inverse transform + add.
; The block is handled as two 8-pixel-wide columns of 8x8 tiles.
; r3d = tiles per column, derived from eob without branches:
;   r3d = 1 + (eob >= 43) + (eob >= 150) + (eob >= 278)
; (each cmp sets CF when eob is below the threshold, each sbb subtracts it).
; Stack: [rsp+16*3] = dst of the second column, [+gprsize] = tile count,
; [+gprsize*2] = original coeffq. After the first column the saved tile count
; is overwritten with zero, so the outer `jnz .loop` is taken exactly once.
cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    %undef cmp

    mov           r4d, eobd
    cmp           eobd, 43  ;if (eob >= 43)
    sbb           r3d, r3d  ; iteration_count++
    cmp           r4d, 150  ;if (eob >= 150)
    sbb           r3d, 0  ; iteration_count++
    cmp           r4d, 278  ;if (eob >= 278)
    sbb           r3d, -4  ; iteration_count++

%if ARCH_X86_32
    LEA           r5, $$
%endif
    lea           r4, [dstq+8]
    mov           [rsp+16*3], r4
    mov           [rsp+gprsize+16*3], r3d
    mov           [rsp+gprsize*2+16*3], coeffq

.loop:
    LOAD_8ROWS    coeffq, 64, 1
    mova          [rsp+16*1], m6
    pxor          m6, m6
    REPX          {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7  ; clear the rows just loaded
    lea           tx2q, [o(m(idct_32x16_internal).end)]
    call m(idct_8x8_internal).pass1_end3
    mova          [rsp+16*0], m2
    mova          [rsp+16*1], m3
    mova          [rsp+16*2], m4
    mova          m3, [o(pw_1697x16)]
    mova          m4, [o(pw_16384)]
    REPX          {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
    mova          m2, [o(pw_8192)]
    REPX          {pmulhrsw x, m2}, m5, m6, m7, m0, m1
    mova          m2, [rsp+16*0]
    mova          [rsp+16*0], m7
    IDTX16        2, 7, 3, 4
    mova          m7, [rsp+16*2]
    mova          [rsp+16*2], m5
    IDTX16        7, 5, 3, 4
    mova          m5, [rsp+16*1]
    mova          [rsp+16*1], m6
    pmulhrsw      m3, m5
    pmulhrsw      m3, m4
    psrlw         m4, 1  ; pw_8192
    paddsw        m3, m5
    pmulhrsw      m2, m4
    pmulhrsw      m3, m4
    pmulhrsw      m4, m7
    call m(idct_8x8_internal).end3
    lea           dstq, [dstq+strideq*2]
    add           coeffq, 16
    dec           r3d
    jg .loop
    mov           coeffq, [rsp+gprsize*2+16*3]
    add           coeffq, 64*8
    mov           r3d, [rsp+gprsize+16*3]
    xor           dstq, dstq
    mov           [rsp+gprsize+16*3], dstq  ; zero the saved count: next time we exit
    mov           dstq, [rsp+16*3]
    test          r3d, r3d
    jnz .loop
    RET


; 32x16 IDENTITY_IDENTITY inverse transform + add.
; r3d is a bit pattern consumed two bits at a time in .loop_end
; (12, 136 or 34952 depending on eob): the loop ends when the remaining
; pattern is zero, bit 1 of the shifted value selects "continue straight on",
; otherwise bit 0 feeds the coeffq adjustment and dstq moves to the next
; 8-pixel column saved at [rsp+16*3].
cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    %undef cmp

    mov           r4d, 12  ;1100b
    mov           r5d, 136  ;1000 1000b
    cmp           eobd, 44  ;if (eob > 43)
    cmovns        r4d, r5d  ; iteration_count+2
    cmp           eobd, 151  ;if (eob > 150)
    mov           r3d, 34952  ;1000 1000 1000 1000b
    cmovs         r3d, r4d  ; iteration_count += 4

%if ARCH_X86_32
    LEA           r5, $$
%endif
    lea           r4, [dstq+8]
    mov           [rsp+16*3], r4

.loop:
    LOAD_8ROWS    coeffq, 32, 1
    REPX          {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    mova          [rsp+16*1], m6
    lea           tx2q, [o(m(idct_32x16_internal).end)]
    call m(idct_8x8_internal).pass1_end3
    mova          [rsp+16*1], m5
    mova          [rsp+16*2], m6
    mova          m6, [o(pw_1697x16)]
    REPX          {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
    pmulhrsw      m7, [o(pw_2048)]
    mova          m5, [rsp+16*1]
    mova          [rsp+16*0], m7
    IDTX16        5, 7, 6
    mova          m7, [rsp+16*2]
    IDTX16        7, 6, 6
    mova          m6, [o(pw_2048)]
    REPX          {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    mova          [rsp+16*2], m5
    mova          [rsp+16*1], m7
    call m(idct_8x8_internal).end3
    lea           dstq, [dstq+strideq*2]
    pxor          m7, m7
    REPX          {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7

.loop_end:
    add           coeffq, 16
    shr           r3d, 2
    jz .ret
    test          r3d, 2
    jnz .loop
    mov           r4d, r3d
    and           r4d, 1
    lea           coeffq, [coeffq+r4*8+32*7]
    mov           dstq, [rsp+16*3]
    lea           r4, [dstq+8]
    mov           [rsp+16*3], r4
    jmp .loop

.ret:
    RET


; 32x32 DCT_DCT inverse transform + add, public entry point.
;   eob == 0 -> .dconly (shared 32-wide add loop, 32 rows),
;   otherwise idct_32x32_internal.
cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA           r5, $$
%endif
    test          eobd, eobd
    jz .dconly

    call m(idct_32x32_internal)
    RET

.dconly:
    movd          m1, [o(pw_2896x8)]
    pmulhrsw      m0, m1, [coeffq]
    movd          m2, [o(pw_8192)]
    mov           [coeffq], eobd  ; eobd is 0 on this path: clears the coefficient
    mov           r3d, 32
    lea           tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
    jmp m(inv_txfm_add_dct_dct_32x8).body


; 32x32 inverse DCT worker (reached with `call`).
; eob-136 is stored at [rsp+gprsize*1+16*35]; its sign selects the fast paths
; (.fast / .fast1) in both passes. Pass 1 runs r3d = 4 times, or 2 when
; eob < 136, advancing coeffq by 16 each time; pass 2 always runs 4 times,
; one group of 8 columns per iteration.
cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov           r4d, 2
    sub           eobd, 136
    mov           [rsp+gprsize*1+16*35], eobd
    mov           r3d, 4
    cmovs         r3d, r4d

%if ARCH_X86_32
    LEA           r5, $$
%endif

    mov           [rsp+gprsize*2+16*35], coeffq

.pass1_loop:
    LOAD_8ROWS    coeffq+64*1, 64*2
    mova          [rsp+gprsize+16*19], m0  ;in1
    mova          [rsp+gprsize+16*26], m1  ;in3
    mova          [rsp+gprsize+16*23], m2  ;in5
    mova          [rsp+gprsize+16*22], m3  ;in7
    mova          [rsp+gprsize+16*21], m4  ;in9
    mova          [rsp+gprsize+16*24], m5  ;in11
    mova          [rsp+gprsize+16*25], m6  ;in13
    mova          [rsp+gprsize+16*20], m7  ;in15

    mov           tx2d, [rsp+gprsize*1+16*35]
    test          tx2d, tx2d
    jl .fast

.full:
    LOAD_8ROWS    coeffq+64*0, 64*4
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS    coeffq+64*2, 64*4
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    LOAD_8ROWS    coeffq+64*17, 64*2
    mova          [rsp+gprsize+16*33], m0  ;in17
    mova          [rsp+gprsize+16*28], m1  ;in19
    mova          [rsp+gprsize+16*29], m2  ;in21
    mova          [rsp+gprsize+16*32], m3  ;in23
    mova          [rsp+gprsize+16*31], m4  ;in25
    mova          [rsp+gprsize+16*30], m5  ;in27
; Continuation of idct_32x32_internal.full (pass 1): last two odd inputs,
; then the full 32-point butterfly.
    mova          [rsp+gprsize+16*27], m6  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31

    call m(idct_8x32_internal).main
    jmp .pass1_end

; eob < 136: only four inputs are loaded for each of the 8- and 16-point
; stages, the other four registers are zeroed, and the 32-point stage uses
; .main_fast.
.fast:
    mova          m0, [coeffq+256*0]
    mova          m1, [coeffq+256*1]
    mova          m2, [coeffq+256*2]
    mova          m3, [coeffq+256*3]
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal).main

    SAVE_7ROWS    rsp+gprsize+16*3, 16
    mova          m0, [coeffq+128*1]
    mova          m1, [coeffq+128*3]
    mova          m2, [coeffq+128*5]
    mova          m3, [coeffq+128*7]
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    call m(idct_8x32_internal).main_fast

; .pass1_end .. .pass1_end4: push the four groups of 8 rows (registers, then
; stack slots 16*11/19/27) through idct_8x8_internal.pass1_end1 with
; m7 = pw_8192 and write them back to coeffq+64*{0,8,16,24}.
.pass1_end:
    mova          [rsp+gprsize+16*0], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x32_internal).pass1_end1)]
    jmp m(idct_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS    coeffq+64*0, 64
    LOAD_8ROWS    rsp+gprsize+16*11, 16
    mova          [rsp+gprsize+16*0], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x32_internal).pass1_end2)]
    jmp m(idct_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS    coeffq+64*8, 64
    LOAD_8ROWS    rsp+gprsize+16*19, 16
    mova          [rsp+gprsize+16*0], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x32_internal).pass1_end3)]
    jmp m(idct_8x8_internal).pass1_end1

.pass1_end3:
    SAVE_8ROWS    coeffq+64*16, 64
    LOAD_8ROWS    rsp+gprsize+16*27, 16
    mova          [rsp+gprsize+16*0], m7
    mova          m7, [o(pw_8192)]
    lea           tx2q, [o(m(idct_32x32_internal).pass1_end4)]
    jmp m(idct_8x8_internal).pass1_end1

.pass1_end4:
    SAVE_8ROWS    coeffq+64*24, 64

    add           coeffq, 16
    dec           r3d
    jg .pass1_loop


.pass2:
    mov           coeffq, [rsp+gprsize*2+16*35]  ; back to the start of the block
    mov           r3d, 4
    lea           tx2q, [o(m(idct_32x32_internal).pass2_end)]

.pass2_loop:
    mov           [rsp+gprsize*3+16*35], r3d  ; r3 is reused below, keep the counter
    lea           r3, [dstq+8]
    mov           [rsp+gprsize*2+16*35], r3  ; destination of the next column group

    mova          m0, [coeffq+16*4 ]
    mova          m1, [coeffq+16*12]
    mova          m2, [coeffq+16*20]
    mova          m3, [coeffq+16*28]
    mova          m4, [coeffq+16*5 ]
    mova          m5, [coeffq+16*13]
    mova          m6, [coeffq+16*21]
    mova          m7, [coeffq+16*29]
    mova          [rsp+gprsize+16*19], m0  ;in1
    mova          [rsp+gprsize+16*26], m1  ;in3
    mova          [rsp+gprsize+16*23], m2  ;in5
    mova          [rsp+gprsize+16*22], m3  ;in7
    mova          [rsp+gprsize+16*21], m4  ;in9
    mova          [rsp+gprsize+16*24], m5  ;in11
    mova          [rsp+gprsize+16*25], m6  ;in13
    mova          [rsp+gprsize+16*20], m7  ;in15

    mov           eobd, [rsp+gprsize*1+16*35]  ; eob-136 saved at function entry
    test          eobd, eobd
    jl .fast1

.full1:
    mova          m0, [coeffq+16*0 ]
    mova          m1, [coeffq+16*16]
    mova          m2, [coeffq+16*1 ]
    mova          m3, [coeffq+16*17]
    mova          m4, [coeffq+16*2 ]
    mova          m5, [coeffq+16*18]
    mova          m6, [coeffq+16*3 ]
    mova          m7, [coeffq+16*19]
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova          m0, [coeffq+16*8 ]
    mova          m1, [coeffq+16*24]
    mova          m2, [coeffq+16*9 ]
    mova          m3, [coeffq+16*25]
    mova          m4, [coeffq+16*10]
    mova          m5, [coeffq+16*26]
    mova          m6, [coeffq+16*11]
    mova          m7, [coeffq+16*27]
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    mova          m0, [coeffq+16*6 ]
    mova          m1, [coeffq+16*14]
    mova          m2, [coeffq+16*22]
    mova          m3, [coeffq+16*30]
    mova          m4, [coeffq+16*7 ]
    mova          m5, [coeffq+16*15]
    mova          m6, [coeffq+16*23]
    mova          m7, [coeffq+16*31]
    mova          [rsp+gprsize+16*33], m0  ;in17
    mova          [rsp+gprsize+16*28], m1  ;in19
    mova          [rsp+gprsize+16*29], m2  ;in21
    mova          [rsp+gprsize+16*32], m3  ;in23
    mova          [rsp+gprsize+16*31], m4  ;in25
    mova          [rsp+gprsize+16*30], m5  ;in27
    mova          [rsp+gprsize+16*27], m6  ;in29
    mova          [rsp+gprsize+16*34], m7  ;in31

    call m(idct_8x32_internal).main
    jmp tx2q

.fast1:
    mova          m0, [coeffq+16*0 ]
    mova          m1, [coeffq+16*16]
    mova          m2, [coeffq+16*1 ]
    mova          m3, [coeffq+16*17]
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova          m0, [coeffq+16*8 ]
    mova          m1, [coeffq+16*24]
    mova          m2, [coeffq+16*9 ]
    mova          m3, [coeffq+16*25]
    pxor          m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova          m7, [rsp+gprsize+16*0]
    SAVE_8ROWS    rsp+gprsize+16*11, 16

    call m(idct_8x32_internal).main_fast
    jmp tx2q

.pass2_end:
    lea           r3, [o(m(idct_32x32_internal).pass2_end1)]  ; continuation for idct_8x32_internal.end
    jmp m(idct_8x32_internal).end

.pass2_end1:
    lea           tx2q, [o(m(idct_32x32_internal).pass2_end)]
    add           coeffq, 16*32
    mov           dstq, [rsp+gprsize*2+16*35]
    mov           r3d, [rsp+gprsize*3+16*35]
    dec           r3d
    jg .pass2_loop

    ret


; 32x32 IDENTITY_IDENTITY inverse transform + add.
; NOTE(review): this routine continues past the end of the reviewed excerpt;
; the visible part is reproduced unchanged.
cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
    %undef cmp

    mov           r4d, 2
    cmp           eobd, 136
    mov           r3d, 4
    cmovs         r3d, r4d

%if ARCH_X86_32
    LEA           r5, $$
%endif

    lea           r4, [dstq+8]
    mov           [rsp+gprsize*0+16*3], r4
    mov           [rsp+gprsize*1+16*3], r3d
    mov           [rsp+gprsize*2+16*3], r3d
    mov           [rsp+gprsize*3+16*3], coeffq

.loop:
    LOAD_8ROWS    coeffq, 64
    mova          [rsp+16*1], m6
    lea           tx2q, [o(m(idct_32x16_internal).end)]
    call m(idct_8x8_internal).pass1_end3
    pmulhrsw      m7, [o(pw_8192)]
    mova          [rsp+16*0], m7
    mova          m7, [o(pw_8192)]
    REPX          {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    mova          [rsp+16*1], m6
    mova          [rsp+16*2], m5
    call m(idct_8x8_internal).end3
    lea           dstq, [dstq+strideq*2]

    pxor          m7, m7
    REPX          {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7

    add           coeffq, 16
    dec           r3d
    jg .loop

    mov           r4d, [rsp+gprsize*2+16*3]
    dec           r4d
    jle .ret

    mov           dstq, [rsp+gprsize*0+16*3]
    mov           coeffq, [rsp+gprsize*3+16*3]
    mov           [rsp+gprsize*2+16*3], r4
    lea           r3,
[dstq+8] 4884 add coeffq, 64*8 4885 mov [rsp+gprsize*0+16*3], r3 4886 mov r3d, [rsp+gprsize*1+16*3] 4887 mov [rsp+gprsize*3+16*3], coeffq 4888 jmp .loop 4889 4890.ret: 4891 RET 4892 4893 4894cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 4895%if ARCH_X86_32 4896 LEA r5, $$ 4897%endif 4898 test eobd, eobd 4899 jz .dconly 4900 4901 call m(idct_16x64_internal) 4902 RET 4903 4904.dconly: 4905 movd m1, [o(pw_2896x8)] 4906 pmulhrsw m0, m1, [coeffq] 4907 movd m2, [o(pw_8192)] 4908 mov [coeffq], eobd 4909 mov r2d, 32 4910 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)] 4911 jmp m(inv_txfm_add_dct_dct_16x4).dconly 4912 4913.end: 4914 RET 4915 4916 4917cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 4918 %undef cmp 4919 4920 mov r4d, 2 4921 sub eobd, 151 4922 mov [rsp+gprsize*1+16*67], eobd 4923 mov r3d, 4 4924 cmovs r3d, r4d 4925 4926%if ARCH_X86_32 4927 LEA r5, $$ 4928%endif 4929 4930 mov [rsp+gprsize*2+16*67], coeffq 4931 4932.pass1_loop: 4933 LOAD_8ROWS coeffq+64*0, 64*2 4934 call m(idct_8x8_internal).main 4935 SAVE_7ROWS rsp+gprsize+16*3, 16 4936 LOAD_8ROWS coeffq+64*1, 64*2 4937 call m(idct_16x8_internal).main 4938 mova m7, [o(pw_8192)] 4939 lea tx2q, [o(m(idct_16x64_internal).pass1_end)] 4940 jmp m(idct_8x8_internal).pass1_end1 4941 4942.pass1_end: 4943 SAVE_8ROWS coeffq+64*8, 64 4944 LOAD_8ROWS rsp+gprsize+16*3, 16 4945 mova [rsp+gprsize+16*0], m7 4946 mova m7, [o(pw_8192)] 4947 lea tx2q, [o(m(idct_16x64_internal).pass1_end1)] 4948 jmp m(idct_8x8_internal).pass1_end1 4949 4950.pass1_end1: 4951 SAVE_8ROWS coeffq+64*0, 64 4952 4953 add coeffq, 16 4954 dec r3d 4955 jg .pass1_loop 4956 4957 mov coeffq, [rsp+gprsize*2+16*67] 4958 mov r3d, 2 4959 lea r4, [dstq+8] 4960 mov [rsp+gprsize*2+16*67], r4 4961 lea r4, [o(m(idct_16x64_internal).end1)] 4962 4963.pass2_loop: 4964 mov [rsp+gprsize*3+16*67], r3d 4965 mov eobd, [rsp+gprsize*1+16*67] 4966 4967 mova m0, [coeffq+16*4 ] ;in1 4968 mova m1, [coeffq+16*12] ;in3 4969 mova 
m2, [coeffq+16*20] ;in5 4970 mova m3, [coeffq+16*28] ;in7 4971 mova m4, [coeffq+16*5 ] ;in9 4972 mova m5, [coeffq+16*13] ;in11 4973 mova m6, [coeffq+16*21] ;in13 4974 mova m7, [coeffq+16*29] ;in15 4975 mova [rsp+gprsize+16*35], m0 ;in1 4976 mova [rsp+gprsize+16*49], m1 ;in3 4977 mova [rsp+gprsize+16*43], m2 ;in5 4978 mova [rsp+gprsize+16*41], m3 ;in7 4979 mova [rsp+gprsize+16*39], m4 ;in9 4980 mova [rsp+gprsize+16*45], m5 ;in11 4981 mova [rsp+gprsize+16*47], m6 ;in13 4982 mova [rsp+gprsize+16*37], m7 ;in15 4983 4984 pxor m4, m4 4985 mova m0, [coeffq+16*0] 4986 mova m1, [coeffq+16*1] 4987 4988 test eobd, eobd 4989 jl .fast 4990 4991.full: 4992 mova m2, [coeffq+16*2] 4993 mova m3, [coeffq+16*3] 4994 4995 REPX {mova x, m4}, m5, m6, m7 4996 call m(idct_8x8_internal).main 4997 SAVE_7ROWS rsp+gprsize+16*3, 16 4998 4999 pxor m4, m4 5000 mova m0, [coeffq+16*16] 5001 mova m1, [coeffq+16*17] 5002 mova m2, [coeffq+16*18] 5003 mova m3, [coeffq+16*19] 5004 5005 REPX {mova x, m4}, m5, m6, m7 5006 call m(idct_16x8_internal).main 5007 mova m7, [rsp+gprsize+16*0] 5008 SAVE_8ROWS rsp+gprsize+16*11, 16 5009 5010 mova m0, [coeffq+16*8 ] 5011 mova m1, [coeffq+16*24] 5012 mova m2, [coeffq+16*9 ] 5013 mova m3, [coeffq+16*25] 5014 mova m4, [coeffq+16*10] 5015 mova m5, [coeffq+16*26] 5016 mova m6, [coeffq+16*11] 5017 mova m7, [coeffq+16*27] 5018 mova [rsp+gprsize+16*19], m0 5019 mova [rsp+gprsize+16*26], m1 5020 mova [rsp+gprsize+16*23], m2 5021 mova [rsp+gprsize+16*22], m3 5022 mova [rsp+gprsize+16*21], m4 5023 mova [rsp+gprsize+16*24], m5 5024 mova [rsp+gprsize+16*25], m6 5025 mova [rsp+gprsize+16*20], m7 5026 5027 call m(idct_8x32_internal).main_fast 5028 SAVE_8ROWS rsp+gprsize+16*3, 16 5029 5030 mova m0, [coeffq+16*6 ] ;in17 5031 mova m1, [coeffq+16*14] ;in19 5032 mova m2, [coeffq+16*22] ;in21 5033 mova m3, [coeffq+16*30] ;in23 5034 mova m4, [coeffq+16*7 ] ;in25 5035 mova m5, [coeffq+16*15] ;in27 5036 mova m6, [coeffq+16*23] ;in29 5037 mova m7, [coeffq+16*31] ;in31 5038 mova 
[rsp+gprsize+16*63], m0 ;in17 5039 mova [rsp+gprsize+16*53], m1 ;in19 5040 mova [rsp+gprsize+16*55], m2 ;in21 5041 mova [rsp+gprsize+16*61], m3 ;in23 5042 mova [rsp+gprsize+16*59], m4 ;in25 5043 mova [rsp+gprsize+16*57], m5 ;in27 5044 mova [rsp+gprsize+16*51], m6 ;in29 5045 mova [rsp+gprsize+16*65], m7 ;in31 5046 5047 call .main 5048 jmp .end 5049 5050.fast: 5051 REPX {mova x, m4}, m2, m3, m5, m6, m7 5052 call m(idct_8x8_internal).main 5053 SAVE_7ROWS rsp+gprsize+16*3, 16 5054 5055 pxor m4, m4 5056 mova m0, [coeffq+16*16] 5057 mova m1, [coeffq+16*17] 5058 5059 REPX {mova x, m4}, m2, m3, m5, m6, m7 5060 call m(idct_16x8_internal).main 5061 mova m7, [rsp+gprsize+16*0] 5062 SAVE_8ROWS rsp+gprsize+16*11, 16 5063 5064 mova m0, [coeffq+16*8 ] 5065 mova m1, [coeffq+16*24] 5066 mova m2, [coeffq+16*9 ] 5067 mova m3, [coeffq+16*25] 5068 mova [rsp+gprsize+16*19], m0 ;in1 5069 mova [rsp+gprsize+16*26], m1 ;in3 5070 mova [rsp+gprsize+16*23], m2 ;in5 5071 mova [rsp+gprsize+16*22], m3 ;in7 5072 5073 call m(idct_8x32_internal).main_veryfast 5074 SAVE_8ROWS rsp+gprsize+16*3, 16 5075 5076 call .main_fast 5077 5078.end: 5079 LOAD_8ROWS rsp+gprsize+16*3, 16 5080 mova [rsp+gprsize+16*0], m7 5081 mov r3, r4 5082 jmp m(idct_8x32_internal).end2 5083 5084.end1: 5085 LOAD_8ROWS rsp+gprsize+16*35, 16 5086 lea dstq, [dstq+strideq*2] 5087 add rsp, 16*32 5088 lea r3, [o(m(idct_16x64_internal).end2)] 5089 jmp m(idct_8x32_internal).end 5090 5091.end2: 5092 add coeffq, 16*32 5093 sub rsp, 16*32 5094 5095 mov dstq, [rsp+gprsize*2+16*67] 5096 mov r3d, [rsp+gprsize*3+16*67] 5097 lea r4, [dstq+8] 5098 mov [rsp+gprsize*2+16*67], r4 5099 lea r4, [o(m(idct_16x64_internal).end1)] 5100 5101 dec r3d 5102 jg .pass2_loop 5103 ret 5104 5105 5106ALIGN function_align 5107.main_fast: 5108 mova m0, [rsp+gprsize*2+16*35] ;in1 5109 pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 5110 pmulhrsw m0, [o(pw_101x8)] ;t32,t33 5111 mova m7, [o(pd_2048)] 5112 mova [rsp+gprsize*2+16*35], m0 ;t32 5113 mova [rsp+gprsize*2+16*66], m3 
;t63 5114 ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a 5115 mova [rsp+gprsize*2+16*36], m3 ;t33a 5116 mova [rsp+gprsize*2+16*65], m0 ;t62a 5117 5118 mova m1, [rsp+gprsize*2+16*37] ;in15 5119 pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 5120 pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 5121 mova [rsp+gprsize*2+16*38], m1 ;t35 5122 mova [rsp+gprsize*2+16*63], m2 ;t60 5123 ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a 5124 mova [rsp+gprsize*2+16*37], m2 ;t34a 5125 mova [rsp+gprsize*2+16*64], m1 ;t61a 5126 5127 mova m0, [rsp+gprsize*2+16*39] ;in9 5128 pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 5129 pmulhrsw m0, [o(pw_897x8)] ;t36,t37 5130 mova [rsp+gprsize*2+16*39], m0 ;t36 5131 mova [rsp+gprsize*2+16*62], m3 ;t59 5132 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a 5133 mova [rsp+gprsize*2+16*40], m3 ;t37a 5134 mova [rsp+gprsize*2+16*61], m0 ;t58a 5135 5136 mova m1, [rsp+gprsize*2+16*41] ;in7 5137 pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 5138 pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 5139 mova [rsp+gprsize*2+16*42], m1 ;t39 5140 mova [rsp+gprsize*2+16*59], m2 ;t56 5141 ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a 5142 mova [rsp+gprsize*2+16*41], m2 ;t38a 5143 mova [rsp+gprsize*2+16*60], m1 ;t57a 5144 5145 mova m0, [rsp+gprsize*2+16*43] ;in5 5146 pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 5147 pmulhrsw m0, [o(pw_501x8)] ;t40,t41 5148 mova [rsp+gprsize*2+16*43], m0 ;t40 5149 mova [rsp+gprsize*2+16*58], m3 ;t55 5150 ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a 5151 mova [rsp+gprsize*2+16*44], m3 ;t41a 5152 mova [rsp+gprsize*2+16*57], m0 ;t54a 5153 5154 mova m1, [rsp+gprsize*2+16*45] ;in11 5155 pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 5156 pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 5157 mova [rsp+gprsize*2+16*46], m1 ;t43 5158 mova [rsp+gprsize*2+16*55], m2 ;t52 5159 ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a 5160 mova [rsp+gprsize*2+16*45], m2 ;t42a 5161 mova [rsp+gprsize*2+16*56], m1 ;t53a 5162 5163 mova m0, [rsp+gprsize*2+16*47] ;in13 5164 pmulhrsw m3, 
m0, [o(pw_3889x8)] ;t50,t51 5165 pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 5166 mova m6, m0 5167 mova [rsp+gprsize*2+16*54], m3 ;t51 5168 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a 5169 mova [rsp+gprsize*2+16*48], m3 ;t45a 5170 mova [rsp+gprsize*2+16*53], m0 ;t50a 5171 5172 mova m0, [rsp+gprsize*2+16*49] ;in3 5173 pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 5174 pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 5175 mova m4, m3 5176 mova m5, m0 5177 5178 jmp .main2 5179 5180ALIGN function_align 5181.main: 5182 mova m0, [rsp+gprsize*2+16*35] ;in1 5183 mova m1, [rsp+gprsize*2+16*65] ;in31 5184 pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a 5185 pmulhrsw m0, [o(pw_101x8)] ;t32a 5186 pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a 5187 pmulhrsw m1, [o(pw_m2824x8)] ;t33a 5188 mova m7, [o(pd_2048)] 5189 psubsw m4, m0, m1 ;t33 5190 paddsw m0, m1 ;t32 5191 psubsw m5, m3, m2 ;t62 5192 paddsw m3, m2 ;t63 5193 ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a 5194 mova [rsp+gprsize*2+16*35], m0 ;t32 5195 mova [rsp+gprsize*2+16*36], m5 ;t33a 5196 mova [rsp+gprsize*2+16*65], m4 ;t62a 5197 mova [rsp+gprsize*2+16*66], m3 ;t63 5198 5199 mova m0, [rsp+gprsize*2+16*63] ;in17 5200 mova m1, [rsp+gprsize*2+16*37] ;in15 5201 pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a 5202 pmulhrsw m0, [o(pw_1660x8)] ;t34a 5203 pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a 5204 pmulhrsw m1, [o(pw_m1474x8)] ;t35a 5205 psubsw m4, m1, m0 ;t34 5206 paddsw m0, m1 ;t35 5207 psubsw m5, m2, m3 ;t61 5208 paddsw m3, m2 ;t60 5209 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a 5210 mova [rsp+gprsize*2+16*37], m5 ;t34a 5211 mova [rsp+gprsize*2+16*38], m0 ;t35 5212 mova [rsp+gprsize*2+16*63], m3 ;t60 5213 mova [rsp+gprsize*2+16*64], m4 ;t61a 5214 5215 mova m0, [rsp+gprsize*2+16*39] ;in9 5216 mova m1, [rsp+gprsize*2+16*61] ;in23 5217 pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a 5218 pmulhrsw m0, [o(pw_897x8)] ;t36a 5219 pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a 5220 pmulhrsw m1, [o(pw_m2191x8)] ;t37a 5221 psubsw m4, m0, m1 ;t37 5222 paddsw m0, m1 ;t36 5223 psubsw m5, 
m3, m2 ;t58 5224 paddsw m3, m2 ;t59 5225 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a 5226 mova [rsp+gprsize*2+16*39], m0 ;t36 5227 mova [rsp+gprsize*2+16*40], m5 ;t37a 5228 mova [rsp+gprsize*2+16*61], m4 ;t58a 5229 mova [rsp+gprsize*2+16*62], m3 ;t59 5230 5231 mova m0, [rsp+gprsize*2+16*59] ;in25 5232 mova m1, [rsp+gprsize*2+16*41] ;in7 5233 pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a 5234 pmulhrsw m0, [o(pw_2359x8)] ;t38a 5235 pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a 5236 pmulhrsw m1, [o(pw_m700x8)] ;t39a 5237 psubsw m4, m1, m0 ;t38 5238 paddsw m0, m1 ;t39 5239 psubsw m5, m2, m3 ;t57 5240 paddsw m3, m2 ;t56 5241 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a 5242 mova [rsp+gprsize*2+16*41], m5 ;t38a 5243 mova [rsp+gprsize*2+16*42], m0 ;t39 5244 mova [rsp+gprsize*2+16*59], m3 ;t56 5245 mova [rsp+gprsize*2+16*60], m4 ;t57a 5246 5247 mova m0, [rsp+gprsize*2+16*43] ;in5 5248 mova m1, [rsp+gprsize*2+16*57] ;in27 5249 pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a 5250 pmulhrsw m0, [o(pw_501x8)] ;t40a 5251 pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a 5252 pmulhrsw m1, [o(pw_m2520x8)] ;t41a 5253 psubsw m4, m0, m1 ;t41 5254 paddsw m0, m1 ;t40 5255 psubsw m5, m3, m2 ;t54 5256 paddsw m3, m2 ;t55 5257 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a 5258 mova [rsp+gprsize*2+16*43], m0 ;t40 5259 mova [rsp+gprsize*2+16*44], m5 ;t41a 5260 mova [rsp+gprsize*2+16*57], m4 ;t54a 5261 mova [rsp+gprsize*2+16*58], m3 ;t55 5262 5263 mova m0, [rsp+gprsize*2+16*55] ;in21 5264 mova m1, [rsp+gprsize*2+16*45] ;in11 5265 pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a 5266 pmulhrsw m0, [o(pw_2019x8)] ;t42a 5267 pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a 5268 pmulhrsw m1, [o(pw_m1092x8)] ;t43a 5269 psubsw m4, m1, m0 ;t42 5270 paddsw m0, m1 ;t43 5271 psubsw m5, m2, m3 ;t53 5272 paddsw m3, m2 ;t52 5273 ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a 5274 mova [rsp+gprsize*2+16*45], m5 ;t42a 5275 mova [rsp+gprsize*2+16*46], m0 ;t43 5276 mova [rsp+gprsize*2+16*55], m3 ;t52 5277 mova [rsp+gprsize*2+16*56], m4 ;t53a 
5278 5279 mova m0, [rsp+gprsize*2+16*47] ;in13 5280 mova m1, [rsp+gprsize*2+16*53] ;in19 5281 pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a 5282 pmulhrsw m0, [o(pw_1285x8)] ;t44a 5283 pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a 5284 pmulhrsw m1, [o(pw_m1842x8)] ;t45a 5285 psubsw m4, m0, m1 ;t45 5286 paddsw m0, m1 ;t44 5287 psubsw m5, m3, m2 ;t50 5288 paddsw m3, m2 ;t51 5289 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a 5290 mova m6, m0 5291 mova [rsp+gprsize*2+16*48], m5 ;t45a 5292 mova [rsp+gprsize*2+16*53], m4 ;t50a 5293 mova [rsp+gprsize*2+16*54], m3 ;t51 5294 5295 mova m0, [rsp+gprsize*2+16*51] ;in29 5296 mova m1, [rsp+gprsize*2+16*49] ;in3 5297 pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a 5298 pmulhrsw m0, [o(pw_2675x8)] ;t46a 5299 pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a 5300 pmulhrsw m1, [o(pw_m301x8)] ;t47a 5301 psubsw m5, m1, m0 ;t46 5302 paddsw m0, m1 ;t47 5303 psubsw m4, m2, m3 ;t49 5304 paddsw m3, m2 ;t48 5305 5306ALIGN function_align 5307.main2: 5308 ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a 5309 mova m1, [rsp+gprsize*2+16*54] ;t51 5310 psubsw m2, m0, m6 ;t44a 5311 paddsw m0, m6 ;t47a 5312 psubsw m6, m3, m1 ;t51a 5313 paddsw m3, m1 ;t48a 5314 mova [rsp+gprsize*2+16*50], m0 ;t47a 5315 mova [rsp+gprsize*2+16*51], m3 ;t48a 5316 ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 5317 mova [rsp+gprsize*2+16*47], m6 ;t44 5318 mova [rsp+gprsize*2+16*54], m2 ;t51 5319 5320 mova m0, [rsp+gprsize*2+16*48] ;t45a 5321 mova m3, [rsp+gprsize*2+16*53] ;t50a 5322 psubsw m2, m4, m0 ;t45 5323 paddsw m4, m0 ;t46 5324 psubsw m6, m5, m3 ;t50 5325 paddsw m5, m3 ;t49 5326 ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a 5327 mova [rsp+gprsize*2+16*48], m6 ;t45a 5328 mova [rsp+gprsize*2+16*49], m4 ;t46 5329 mova [rsp+gprsize*2+16*52], m5 ;t49 5330 mova [rsp+gprsize*2+16*53], m2 ;t50a 5331 5332 mova m0, [rsp+gprsize*2+16*43] ;t40 5333 mova m2, [rsp+gprsize*2+16*46] ;t43 5334 mova m3, [rsp+gprsize*2+16*55] ;t52 5335 mova m1, [rsp+gprsize*2+16*58] ;t55 5336 psubsw m4, m0, m2 ;t43a 
5337 paddsw m0, m2 ;t40a 5338 psubsw m5, m1, m3 ;t52a 5339 paddsw m1, m3 ;t55a 5340 ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 5341 mova [rsp+gprsize*2+16*43], m0 ;t40a 5342 mova [rsp+gprsize*2+16*46], m5 ;t43 5343 mova [rsp+gprsize*2+16*55], m4 ;t52 5344 mova [rsp+gprsize*2+16*58], m1 ;t55a 5345 5346 mova m0, [rsp+gprsize*2+16*44] ;t41a 5347 mova m2, [rsp+gprsize*2+16*45] ;t42a 5348 mova m3, [rsp+gprsize*2+16*56] ;t53a 5349 mova m1, [rsp+gprsize*2+16*57] ;t54a 5350 psubsw m4, m0, m2 ;t42 5351 paddsw m0, m2 ;t41 5352 psubsw m5, m1, m3 ;t53 5353 paddsw m1, m3 ;t54 5354 ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a 5355 mova [rsp+gprsize*2+16*44], m0 ;t41 5356 mova [rsp+gprsize*2+16*45], m5 ;t42a 5357 mova [rsp+gprsize*2+16*56], m4 ;t53a 5358 mova [rsp+gprsize*2+16*57], m1 ;t54 5359 5360 mova m0, [rsp+gprsize*2+16*41] ;t38a 5361 mova m2, [rsp+gprsize*2+16*40] ;t37a 5362 mova m3, [rsp+gprsize*2+16*61] ;t58a 5363 mova m1, [rsp+gprsize*2+16*60] ;t57a 5364 psubsw m4, m0, m2 ;t37 5365 paddsw m0, m2 ;t38 5366 psubsw m5, m1, m3 ;t58 5367 paddsw m1, m3 ;t57 5368 ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a 5369 mova [rsp+gprsize*2+16*41], m0 ;t38 5370 mova [rsp+gprsize*2+16*40], m5 ;t37a 5371 mova [rsp+gprsize*2+16*61], m4 ;t58a 5372 mova [rsp+gprsize*2+16*60], m1 ;t57 5373 5374 mova m0, [rsp+gprsize*2+16*42] ;t39 5375 mova m2, [rsp+gprsize*2+16*39] ;t36 5376 mova m3, [rsp+gprsize*2+16*62] ;t59 5377 mova m1, [rsp+gprsize*2+16*59] ;t56 5378 psubsw m4, m0, m2 ;t36a 5379 paddsw m0, m2 ;t39a 5380 psubsw m5, m1, m3 ;t59a 5381 paddsw m1, m3 ;t56a 5382 ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 5383 mova [rsp+gprsize*2+16*42], m0 ;t39a 5384 mova [rsp+gprsize*2+16*39], m5 ;t36 5385 mova [rsp+gprsize*2+16*62], m4 ;t59 5386 mova [rsp+gprsize*2+16*59], m1 ;t56a 5387 5388 mova m0, [rsp+gprsize*2+16*35] ;t32 5389 mova m2, [rsp+gprsize*2+16*38] ;t35 5390 mova m3, [rsp+gprsize*2+16*63] ;t60 5391 mova m1, [rsp+gprsize*2+16*66] ;t63 5392 psubsw m4, m0, m2 ;t35a 
5393 paddsw m0, m2 ;t32a 5394 psubsw m5, m1, m3 ;t60a 5395 paddsw m1, m3 ;t63a 5396 ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 5397 mova [rsp+gprsize*2+16*35], m0 ;t32a 5398 mova [rsp+gprsize*2+16*38], m5 ;t35 5399 mova [rsp+gprsize*2+16*63], m4 ;t60 5400 mova [rsp+gprsize*2+16*66], m1 ;t63a 5401 5402 mova m0, [rsp+gprsize*2+16*36] ;t33a 5403 mova m2, [rsp+gprsize*2+16*37] ;t34a 5404 mova m3, [rsp+gprsize*2+16*64] ;t61a 5405 mova m1, [rsp+gprsize*2+16*65] ;t62a 5406 psubsw m4, m0, m2 ;t34 5407 paddsw m0, m2 ;t33 5408 psubsw m5, m1, m3 ;t61 5409 paddsw m1, m3 ;t62 5410 ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a 5411 5412 mova m2, [rsp+gprsize*2+16*41] ;t38 5413 mova m3, [rsp+gprsize*2+16*60] ;t57 5414 psubsw m6, m0, m2 ;t38a 5415 paddsw m0, m2 ;t33a 5416 psubsw m2, m1, m3 ;t57a 5417 paddsw m1, m3 ;t62a 5418 mova [rsp+gprsize*2+16*36], m0 ;t33a 5419 mova [rsp+gprsize*2+16*65], m1 ;t62a 5420 ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 5421 mova [rsp+gprsize*2+16*41], m2 ;t38 5422 mova [rsp+gprsize*2+16*60], m6 ;t57 5423 5424 mova m2, [rsp+gprsize*2+16*40] ;t37 5425 mova m3, [rsp+gprsize*2+16*61] ;t58 5426 psubsw m0, m5, m2 ;t37 5427 paddsw m5, m2 ;t34 5428 psubsw m1, m4, m3 ;t58 5429 paddsw m4, m3 ;t61 5430 ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a 5431 mova [rsp+gprsize*2+16*37], m5 ;t34 5432 mova [rsp+gprsize*2+16*64], m4 ;t61 5433 mova [rsp+gprsize*2+16*40], m1 ;t37a 5434 mova [rsp+gprsize*2+16*61], m0 ;t58a 5435 5436 mova m0, [rsp+gprsize*2+16*38] ;t35 5437 mova m2, [rsp+gprsize*2+16*39] ;t36 5438 mova m3, [rsp+gprsize*2+16*62] ;t59 5439 mova m1, [rsp+gprsize*2+16*63] ;t60 5440 psubsw m4, m0, m2 ;t36a 5441 paddsw m0, m2 ;t35a 5442 psubsw m5, m1, m3 ;t59a 5443 paddsw m1, m3 ;t60a 5444 ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 5445 mova [rsp+gprsize*2+16*38], m0 ;t35a 5446 mova [rsp+gprsize*2+16*39], m5 ;t36 5447 mova [rsp+gprsize*2+16*62], m4 ;t59 5448 mova [rsp+gprsize*2+16*63], m1 ;t60a 5449 5450 mova m0, 
[rsp+gprsize*2+16*35] ;t32a 5451 mova m2, [rsp+gprsize*2+16*42] ;t39a 5452 mova m3, [rsp+gprsize*2+16*59] ;t56a 5453 mova m1, [rsp+gprsize*2+16*66] ;t63a 5454 psubsw m4, m0, m2 ;t39 5455 paddsw m0, m2 ;t32 5456 psubsw m5, m1, m3 ;t56 5457 paddsw m1, m3 ;t63 5458 ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a 5459 mova [rsp+gprsize*2+16*35], m0 ;t32 5460 mova [rsp+gprsize*2+16*42], m5 ;t39a 5461 mova [rsp+gprsize*2+16*59], m4 ;t56a 5462 mova [rsp+gprsize*2+16*66], m1 ;t63 5463 5464 mova m0, [rsp+gprsize*2+16*50] ;t47a 5465 mova m2, [rsp+gprsize*2+16*43] ;t40a 5466 mova m3, [rsp+gprsize*2+16*58] ;t55a 5467 mova m1, [rsp+gprsize*2+16*51] ;t48a 5468 psubsw m4, m0, m2 ;t40 5469 paddsw m0, m2 ;t47 5470 psubsw m5, m1, m3 ;t55 5471 paddsw m1, m3 ;t48 5472 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a 5473 mova [rsp+gprsize*2+16*50], m0 ;t47 5474 mova [rsp+gprsize*2+16*43], m5 ;t40a 5475 mova [rsp+gprsize*2+16*58], m4 ;t55a 5476 mova [rsp+gprsize*2+16*51], m1 ;t48 5477 5478 mova m0, [rsp+gprsize*2+16*49] ;t46 5479 mova m2, [rsp+gprsize*2+16*44] ;t41 5480 mova m3, [rsp+gprsize*2+16*57] ;t54 5481 mova m1, [rsp+gprsize*2+16*52] ;t49 5482 psubsw m4, m0, m2 ;t41a 5483 paddsw m0, m2 ;t46a 5484 psubsw m5, m1, m3 ;t54a 5485 paddsw m1, m3 ;t49a 5486 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 5487 mova [rsp+gprsize*2+16*49], m0 ;t46a 5488 mova [rsp+gprsize*2+16*44], m5 ;t41 5489 mova [rsp+gprsize*2+16*57], m4 ;t54 5490 mova [rsp+gprsize*2+16*52], m1 ;t49a 5491 5492 mova m0, [rsp+gprsize*2+16*48] ;t45a 5493 mova m2, [rsp+gprsize*2+16*45] ;t42a 5494 mova m3, [rsp+gprsize*2+16*56] ;t53a 5495 mova m1, [rsp+gprsize*2+16*53] ;t50a 5496 psubsw m4, m0, m2 ;t42 5497 paddsw m0, m2 ;t45 5498 psubsw m5, m1, m3 ;t53 5499 paddsw m1, m3 ;t50 5500 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a 5501 mova [rsp+gprsize*2+16*48], m0 ;t45 5502 mova [rsp+gprsize*2+16*45], m5 ;t42a 5503 mova [rsp+gprsize*2+16*56], m4 ;t53a 5504 mova [rsp+gprsize*2+16*53], m1 ;t50 5505 5506 mova 
m0, [rsp+gprsize*2+16*47] ;t44 5507 mova m2, [rsp+gprsize*2+16*46] ;t43 5508 mova m3, [rsp+gprsize*2+16*55] ;t52 5509 mova m1, [rsp+gprsize*2+16*54] ;t51 5510 psubsw m4, m0, m2 ;t43a 5511 paddsw m0, m2 ;t44a 5512 psubsw m5, m1, m3 ;t52a 5513 paddsw m1, m3 ;t51a 5514 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 5515 5516 mova m2, [rsp+gprsize*2+16*38] ;t35a 5517 mova m3, [rsp+gprsize*2+16*31] ;tmp[28] 5518 psubsw m6, m2, m0 ;t44 5519 paddsw m2, m0 ;t35 5520 psubsw m0, m3, m2 ;out35 5521 paddsw m2, m3 ;out28 5522 mova m3, [rsp+gprsize*2+16*63] ;t60a 5523 mova [rsp+gprsize*2+16*38], m0 ;out35 5524 mova [rsp+gprsize*2+16*31], m2 ;out28 5525 psubsw m0, m3, m1 ;t51 5526 paddsw m3, m1 ;t60 5527 ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a 5528 mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] 5529 psubsw m1, m2, m3 ;out60 5530 paddsw m2, m3 ;out3 5531 mova m3, [rsp+gprsize*2+16*22] ;tmp[19] 5532 mova [rsp+gprsize*2+16*63], m1 ;out60 5533 mova [rsp+gprsize*2+16*6 ], m2 ;out3 5534 psubsw m1, m3, m0 ;out44 5535 paddsw m3, m0 ;out19 5536 mova m2, [rsp+gprsize*2+16*15] ;tmp[12] 5537 5538 mova m0, [rsp+gprsize*2+16*39] ;t36 5539 mova [rsp+gprsize*2+16*47], m1 ;out44 5540 mova [rsp+gprsize*2+16*22], m3 ;out19 5541 mova m1, [rsp+gprsize*2+16*62] ;t59 5542 psubsw m3, m2, m6 ;out51 5543 paddsw m2, m6 ;out12 5544 mova [rsp+gprsize*2+16*54], m3 ;out51 5545 mova [rsp+gprsize*2+16*15], m2 ;out12 5546 psubsw m2, m0, m5 ;t43a 5547 paddsw m0, m5 ;t36a 5548 mova m5, [rsp+gprsize*2+16*30] ;tmp[27] 5549 psubsw m3, m1, m4 ;t52a 5550 paddsw m1, m4 ;t59a 5551 ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 5552 mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] 5553 psubsw m6, m5, m0 ;out36 5554 paddsw m5, m0 ;out27 5555 psubsw m0, m4, m1 ;out59 5556 paddsw m4, m1 ;out4 5557 mova [rsp+gprsize*2+16*39], m6 ;out36 5558 mova [rsp+gprsize*2+16*30], m5 ;out27 5559 mova [rsp+gprsize*2+16*62], m0 ;out59 5560 mova [rsp+gprsize*2+16*7 ], m4 ;out4 5561 mova m0, [rsp+gprsize*2+16*23] ;tmp[20] 5562 mova m5, 
[rsp+gprsize*2+16*14] ;tmp[11] 5563 psubsw m4, m0, m3 ;out43 5564 paddsw m0, m3 ;out20 5565 psubsw m6, m5, m2 ;out52 5566 paddsw m5, m2 ;out11 5567 mova [rsp+gprsize*2+16*46], m4 ;out43 5568 mova [rsp+gprsize*2+16*23], m0 ;out20 5569 mova [rsp+gprsize*2+16*55], m6 ;out52 5570 mova [rsp+gprsize*2+16*14], m5 ;out11 5571 5572 mova m0, [rsp+gprsize*2+16*40] ;t37a 5573 mova m5, [rsp+gprsize*2+16*45] ;t42a 5574 mova m3, [rsp+gprsize*2+16*56] ;t53a 5575 mova m1, [rsp+gprsize*2+16*61] ;t58a 5576 mova m2, [rsp+gprsize*2+16*29] ;tmp[26] 5577 psubsw m4, m0, m5 ;t42 5578 paddsw m0, m5 ;t37 5579 psubsw m5, m1, m3 ;t53 5580 paddsw m1, m3 ;t58 5581 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 5582 mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] 5583 psubsw m6, m2, m0 ;out37 5584 paddsw m2, m0 ;out26 5585 psubsw m0, m3, m1 ;out58 5586 paddsw m3, m1 ;out5 5587 mova [rsp+gprsize*2+16*40], m6 ;out37 5588 mova [rsp+gprsize*2+16*29], m2 ;out26 5589 mova [rsp+gprsize*2+16*61], m0 ;out58 5590 mova [rsp+gprsize*2+16*8 ], m3 ;out5 5591 mova m0, [rsp+gprsize*2+16*24] ;tmp[21] 5592 mova m1, [rsp+gprsize*2+16*13] ;tmp[10] 5593 psubsw m2, m0, m5 ;out42 5594 paddsw m0, m5 ;out21 5595 psubsw m3, m1, m4 ;out53 5596 paddsw m1, m4 ;out10 5597 mova [rsp+gprsize*2+16*45], m2 ;out42 5598 mova [rsp+gprsize*2+16*24], m0 ;out21 5599 mova [rsp+gprsize*2+16*56], m3 ;out53 5600 mova [rsp+gprsize*2+16*13], m1 ;out10 5601 5602 mova m0, [rsp+gprsize*2+16*41] ;t38 5603 mova m5, [rsp+gprsize*2+16*44] ;t41 5604 mova m3, [rsp+gprsize*2+16*57] ;t54 5605 mova m1, [rsp+gprsize*2+16*60] ;t57 5606 mova m2, [rsp+gprsize*2+16*28] ;tmp[25] 5607 psubsw m4, m0, m5 ;t41a 5608 paddsw m0, m5 ;t38a 5609 psubsw m5, m1, m3 ;t54a 5610 paddsw m1, m3 ;t57a 5611 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a 5612 mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] 5613 psubsw m6, m2, m0 ;out38 5614 paddsw m2, m0 ;out25 5615 psubsw m0, m3, m1 ;out57 5616 paddsw m3, m1 ;out6 5617 mova [rsp+gprsize*2+16*41], m6 ;out38 5618 mova 
[rsp+gprsize*2+16*28], m2 ;out25 5619 mova [rsp+gprsize*2+16*60], m0 ;out57 5620 mova [rsp+gprsize*2+16*9 ], m3 ;out6 5621 mova m0, [rsp+gprsize*2+16*25] ;tmp[22] 5622 mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] 5623 psubsw m2, m0, m5 ;out41 5624 paddsw m0, m5 ;out22 5625 psubsw m3, m1, m4 ;out54 5626 paddsw m1, m4 ;out9 5627 mova [rsp+gprsize*2+16*44], m2 ;out41 5628 mova [rsp+gprsize*2+16*25], m0 ;out22 5629 mova [rsp+gprsize*2+16*57], m3 ;out54 5630 mova [rsp+gprsize*2+16*12], m1 ;out9 5631 5632 mova m0, [rsp+gprsize*2+16*42] ;t39a 5633 mova m5, [rsp+gprsize*2+16*43] ;t40a 5634 mova m3, [rsp+gprsize*2+16*58] ;t55a 5635 mova m1, [rsp+gprsize*2+16*59] ;t56a 5636 mova m2, [rsp+gprsize*2+16*27] ;tmp[24] 5637 psubsw m4, m0, m5 ;t40 5638 paddsw m0, m5 ;t39 5639 psubsw m5, m1, m3 ;t55 5640 paddsw m1, m3 ;t56 5641 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a 5642 mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] 5643 psubsw m6, m2, m0 ;out39 5644 paddsw m2, m0 ;out24 5645 psubsw m0, m3, m1 ;out56 5646 paddsw m3, m1 ;out7 5647 mova [rsp+gprsize*2+16*42], m6 ;out39 5648 mova [rsp+gprsize*2+16*27], m2 ;out24 5649 mova [rsp+gprsize*2+16*59], m0 ;out56 5650 mova [rsp+gprsize*2+16*10], m3 ;out7 5651 mova m0, [rsp+gprsize*2+16*26] ;tmp[23] 5652 mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] 5653 psubsw m2, m0, m5 ;out40 5654 paddsw m0, m5 ;out23 5655 psubsw m3, m1, m4 ;out55 5656 paddsw m1, m4 ;out8 5657 mova [rsp+gprsize*2+16*43], m2 ;out40 5658 mova [rsp+gprsize*2+16*26], m0 ;out23 5659 mova [rsp+gprsize*2+16*58], m3 ;out55 5660 mova [rsp+gprsize*2+16*11], m1 ;out8 5661 5662 mova m0, [rsp+gprsize*2+16*37] ;t34 5663 mova m5, [rsp+gprsize*2+16*48] ;t45 5664 mova m3, [rsp+gprsize*2+16*53] ;t50 5665 mova m1, [rsp+gprsize*2+16*64] ;t61 5666 mova m2, [rsp+gprsize*2+16*32] ;tmp[29] 5667 psubsw m4, m0, m5 ;t45a 5668 paddsw m0, m5 ;t34a 5669 psubsw m5, m1, m3 ;t50a 5670 paddsw m1, m3 ;t61a 5671 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 5672 mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] 5673 
psubsw m6, m2, m0 ;out34 5674 paddsw m2, m0 ;out29 5675 psubsw m0, m3, m1 ;out61 5676 paddsw m3, m1 ;out2 5677 mova [rsp+gprsize*2+16*37], m6 ;out34 5678 mova [rsp+gprsize*2+16*32], m2 ;out29 5679 mova [rsp+gprsize*2+16*64], m0 ;out61 5680 mova [rsp+gprsize*2+16*5 ], m3 ;out2 5681 mova m0, [rsp+gprsize*2+16*21] ;tmp[18] 5682 mova m1, [rsp+gprsize*2+16*16] ;tmp[13] 5683 psubsw m2, m0, m5 ;out45 5684 paddsw m0, m5 ;out18 5685 psubsw m3, m1, m4 ;out50 5686 paddsw m1, m4 ;out13 5687 mova [rsp+gprsize*2+16*48], m2 ;out45 5688 mova [rsp+gprsize*2+16*21], m0 ;out18 5689 mova [rsp+gprsize*2+16*53], m3 ;out50 5690 mova [rsp+gprsize*2+16*16], m1 ;out13 5691 5692 mova m0, [rsp+gprsize*2+16*36] ;t33a 5693 mova m5, [rsp+gprsize*2+16*49] ;t46a 5694 mova m3, [rsp+gprsize*2+16*52] ;t49a 5695 mova m1, [rsp+gprsize*2+16*65] ;t62a 5696 mova m2, [rsp+gprsize*2+16*33] ;tmp[30] 5697 psubsw m4, m0, m5 ;t46 5698 paddsw m0, m5 ;t33 5699 psubsw m5, m1, m3 ;t49 5700 paddsw m1, m3 ;t62 5701 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 5702 mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] 5703 psubsw m6, m2, m0 ;out33 5704 paddsw m2, m0 ;out30 5705 psubsw m0, m3, m1 ;out62 5706 paddsw m3, m1 ;out1 5707 mova [rsp+gprsize*2+16*36], m6 ;out33 5708 mova [rsp+gprsize*2+16*33], m2 ;out30 5709 mova [rsp+gprsize*2+16*65], m0 ;out62 5710 mova [rsp+gprsize*2+16*4 ], m3 ;out1 5711 mova m0, [rsp+gprsize*2+16*20] ;tmp[17] 5712 mova m1, [rsp+gprsize*2+16*17] ;tmp[14] 5713 psubsw m2, m0, m5 ;out46 5714 paddsw m0, m5 ;out17 5715 psubsw m3, m1, m4 ;out49 5716 paddsw m1, m4 ;out14 5717 mova [rsp+gprsize*2+16*49], m2 ;out46 5718 mova [rsp+gprsize*2+16*20], m0 ;out17 5719 mova [rsp+gprsize*2+16*52], m3 ;out49 5720 mova [rsp+gprsize*2+16*17], m1 ;out14 5721 5722 mova m0, [rsp+gprsize*2+16*35] ;t32 5723 mova m5, [rsp+gprsize*2+16*50] ;t47 5724 mova m3, [rsp+gprsize*2+16*51] ;t48 5725 mova m1, [rsp+gprsize*2+16*66] ;t63 5726 mova m2, [rsp+gprsize*2+16*34] ;tmp[31] 5727 psubsw m4, m0, m5 ;t47a 5728 paddsw m0, m5 ;t32a 
; Remainder of the last output group of the preceding routine
; (produces out0/out15/out16/out31/out32/out47/out48/out63, then returns).
    psubsw     m5, m1, m3                       ; t48a
    paddsw     m1, m3                           ; t63a
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896     ; t47, t48
    mova       m3, [rsp+gprsize*2+16*3]         ; tmp[0]
    psubsw     m6, m2, m0                       ; out32
    paddsw     m2, m0                           ; out31
    psubsw     m0, m3, m1                       ; out63
    paddsw     m3, m1                           ; out0
    mova       [rsp+gprsize*2+16*35], m6        ; out32
    mova       [rsp+gprsize*2+16*34], m2        ; out31
    mova       [rsp+gprsize*2+16*66], m0        ; out63
    mova       [rsp+gprsize*2+16*3], m3         ; out0
    mova       m0, [rsp+gprsize*2+16*19]        ; tmp[16]
    mova       m1, [rsp+gprsize*2+16*18]        ; tmp[15]
    psubsw     m2, m0, m5                       ; out47
    paddsw     m0, m5                           ; out16
    psubsw     m3, m1, m4                       ; out48
    paddsw     m1, m4                           ; out15
    mova       [rsp+gprsize*2+16*50], m2        ; out47
    mova       [rsp+gprsize*2+16*19], m0        ; out16
    mova       [rsp+gprsize*2+16*51], m3        ; out48
    mova       [rsp+gprsize*2+16*18], m1        ; out15
    ret


;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x16(dst, stride, coeff, eob)
; Entry point. When eob == 0 only the DC path (.dconly) runs; otherwise
; the transform is done by idct_64x16_internal.
;
; .body/.loop are shared: the 64x32 and 64x64 DC paths further down jump
; to .body with m0 (partially scaled DC), m1, m2 (multipliers), r3d (row
; count) and tx2q (address to continue at after the loop) already set.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA        r5, $$
%endif
    test       eobd, eobd
    jz         .dconly

    call       m(idct_64x16_internal)
    RET

.dconly:
    movd       m1, [o(pw_2896x8)]
    pmulhrsw   m0, m1, [coeffq]
    movd       m2, [o(pw_8192)]
    mov        [coeffq], eobd                   ; eobd is 0 on this path (jz above): clears the DC coefficient
    mov        r3d, 16                          ; number of rows to process
    lea        tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)]

.body:
    pmulhrsw   m0, m2
    movd       m2, [o(pw_2048)]                 ; intentionally rip-relative
    pmulhrsw   m0, m1
    pmulhrsw   m0, m2
    pshuflw    m0, m0, q0000                    ; replicate word 0 across the low four words
    punpcklwd  m0, m0                           ; ... and then across all eight words
    pxor       m7, m7                           ; zero, used to widen bytes to words

.loop:
    ; One row per iteration: 64 bytes at dstq are widened to words, offset
    ; by m0, packed back to bytes with unsigned saturation and stored.
    mova       m1, [dstq+16*0]
    mova       m3, [dstq+16*1]
    mova       m5, [dstq+16*2]
    mova       m6, [dstq+16*3]
    punpckhbw  m2, m1, m7
    punpcklbw  m1, m7
    punpckhbw  m4, m3, m7
    punpcklbw  m3, m7
    paddw      m2, m0
    paddw      m1, m0
    paddw      m4, m0
    paddw      m3, m0
    packuswb   m1, m2
    packuswb   m3, m4
    punpckhbw  m2, m5, m7
    punpcklbw  m5, m7
    punpckhbw  m4, m6, m7
    punpcklbw  m6, m7
    paddw      m2, m0
    paddw      m5, m0
    paddw      m4, m0
    paddw      m6, m0
    packuswb   m5, m2
    packuswb   m6, m4
    mova       [dstq+16*0], m1
    mova       [dstq+16*1], m3
    mova       [dstq+16*2], m5
    mova       [dstq+16*3], m6
    add        dstq, strideq
    dec        r3d
    jg         .loop
    jmp        tx2q                             ; continue at the address the caller placed in tx2q

.end:
    RET


; LOAD_4ROWS src, stride[, is_rect2]
; Loads four 16-byte rows [src+stride*0..3] into m0-m3. When the optional
; third argument is non-zero, each row is multiplied (pmulhrsw) by
; pw_2896x8 on load; m3 holds the constant until the last row replaces it.
%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2

%if %3
    mova       m3, [o(pw_2896x8)]
    pmulhrsw   m0, m3, [%1+%2*0]
    pmulhrsw   m1, m3, [%1+%2*1]
    pmulhrsw   m2, m3, [%1+%2*2]
    pmulhrsw   m3, [%1+%2*3]
%else
    mova       m0, [%1+%2*0]
    mova       m1, [%1+%2*1]
    mova       m2, [%1+%2*2]
    mova       m3, [%1+%2*3]
%endif
%endmacro

; LOAD_4ROWS_H src, stride
; Loads four 16-byte rows [src+stride*0..3] into m4-m7 (no scaling).
%macro LOAD_4ROWS_H 2 ;src, stride
    mova       m4, [%1+%2*0]
    mova       m5, [%1+%2*1]
    mova       m6, [%1+%2*2]
    mova       m7, [%1+%2*3]
%endmacro

;-----------------------------------------------------------------------
; idct_64x16_internal
; Reached by `call` from inv_txfm_add_dct_dct_64x16; returns with `ret`.
;
; Stack slots used here:
;   [rsp+gprsize*2+16*67]  caller's dst pointer (advanced by 8 per pass-2 step)
;   [rsp+gprsize*1+16*67]  pass-2 loop counter
;   [rsp+gprsize+16*68]    scratch receiving the second half of pass 1
;
; Pass 1 runs twice; each iteration moves coeffq and dstq on by 16 bytes.
; Of the eight 8-row groups produced per iteration, the first four are
; written back over coeffq and the last four go to the scratch at dstq.
; Pass 2 is two loops of four iterations: the first reads from coeffq and
; zeroes what it consumed, the second reads from the stack scratch.
;
; The `lea tx2q, [label]` / `jmp m(...)` pairs hand a continuation address
; to shared tails in other routines; presumably those tails finish with
; `jmp tx2q` (their code is outside this section -- verify there).
;-----------------------------------------------------------------------
cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mov        r3d, 2                           ; pass-1 iteration count
    mov        [rsp+gprsize*2+16*67], dstq      ; park the real destination
    lea        dstq, [rsp+gprsize+16*68]        ; dstq now addresses stack scratch

.pass1_loop:
    LOAD_4ROWS coeffq+32*0, 32*8
    pxor       m4, m4
    REPX       {mova x, m4}, m5, m6, m7         ; remaining four inputs are zero
    call       m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor       m4, m4
    LOAD_4ROWS coeffq+32*4, 32*8

    REPX       {mova x, m4}, m5, m6, m7
    call       m(idct_16x8_internal).main
    mova       m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+32*2, 32*4
    mova       [rsp+gprsize+16*19], m0
    mova       [rsp+gprsize+16*26], m1
    mova       [rsp+gprsize+16*23], m2
    mova       [rsp+gprsize+16*22], m3
    mova       [rsp+gprsize+16*21], m4
    mova       [rsp+gprsize+16*24], m5
    mova       [rsp+gprsize+16*25], m6
    mova       [rsp+gprsize+16*20], m7

    call       m(idct_8x32_internal).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+32*1, 32*2
    mova       [rsp+gprsize+16*35], m0          ; in1
    mova       [rsp+gprsize+16*49], m1          ; in3
    mova       [rsp+gprsize+16*43], m2          ; in5
    mova       [rsp+gprsize+16*41], m3          ; in7
    mova       [rsp+gprsize+16*39], m4          ; in9
    mova       [rsp+gprsize+16*45], m5          ; in11
    mova       [rsp+gprsize+16*47], m6          ; in13
    mova       [rsp+gprsize+16*37], m7          ; in15

    LOAD_8ROWS coeffq+32*17, 32*2
    mova       [rsp+gprsize+16*63], m0          ; in17
    mova       [rsp+gprsize+16*53], m1          ; in19
    mova       [rsp+gprsize+16*55], m2          ; in21
    mova       [rsp+gprsize+16*61], m3          ; in23
    mova       [rsp+gprsize+16*59], m4          ; in25
    mova       [rsp+gprsize+16*57], m5          ; in27
    mova       [rsp+gprsize+16*51], m6          ; in29
    mova       [rsp+gprsize+16*65], m7          ; in31

    call       m(idct_16x64_internal).main

    ; Eight groups of 8 rows follow; each is loaded from the stack, sent
    ; through idct_8x8_internal.pass1_end1 with m7 = pw_8192, and saved.
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+32*0, 32
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end1)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+32*8, 32
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end2)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+32*16, 32
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end3)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end3:
    SAVE_8ROWS coeffq+32*24, 32
    LOAD_8ROWS rsp+gprsize+16*35, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end4)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end4:
    SAVE_8ROWS dstq+32*0, 32
    LOAD_8ROWS rsp+gprsize+16*43, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end5)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end5:
    SAVE_8ROWS dstq+32*8, 32
    LOAD_8ROWS rsp+gprsize+16*51, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end6)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end6:
    SAVE_8ROWS dstq+32*16, 32
    LOAD_8ROWS rsp+gprsize+16*59, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x16_internal).pass1_end7)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end7:
    SAVE_8ROWS dstq+32*24, 32

    add        coeffq, 16
    add        dstq, 16
    dec        r3d
    jg         .pass1_loop

.pass2:
    mov        dstq, [rsp+gprsize*2+16*67]      ; back to the real destination
    sub        coeffq, 32                       ; undo the two 16-byte steps of pass 1
    mov        r3d, 4

.pass2_loop:
    mov        [rsp+gprsize*1+16*67], r3d       ; r3 is reused below, keep the counter on the stack

    LOAD_4ROWS coeffq+16*0, 32*2
    LOAD_4ROWS_H coeffq+16*1, 32*2
    call       m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+16*2, 32*2
    LOAD_4ROWS_H coeffq+16*3, 32*2
    call       m(idct_16x8_internal).main

    mov        r3, dstq                         ; remember the top row pointer
    lea        tx2q, [o(m(idct_64x16_internal).end)]
    lea        dstq, [dstq+strideq*8]           ; rows 8..15 are handled first
    jmp        m(idct_8x8_internal).end

.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x16_internal).end1)]
    mov        dstq, r3                         ; then rows 0..7
    jmp        m(idct_8x8_internal).end

.end1:
    pxor       m7, m7
    REPX       {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15   ; zero the 16*16 bytes just consumed

    add        coeffq, 16*16
    mov        r3d, [rsp+gprsize*1+16*67]
    mov        dstq, [rsp+gprsize*2+16*67]
    add        dstq, 8
    mov        [rsp+gprsize*2+16*67], dstq
    dec        r3d
    jg         .pass2_loop

    mov        r3d, 4
    lea        coeffq, [rsp+gprsize+16*68]      ; second half: read from the stack scratch
.pass2_loop2:
    mov        [rsp+gprsize*1+16*67], r3d

    LOAD_4ROWS coeffq+16*0, 32*2
    LOAD_4ROWS_H coeffq+16*1, 32*2
    call       m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+16*2, 32*2
    LOAD_4ROWS_H coeffq+16*3, 32*2
    call       m(idct_16x8_internal).main

    mov        r3, dstq
    lea        tx2q, [o(m(idct_64x16_internal).end2)]
    lea        dstq, [dstq+strideq*8]
    jmp        m(idct_8x8_internal).end

.end2:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x16_internal).end3)]
    mov        dstq, r3
    jmp        m(idct_8x8_internal).end

.end3:
    ; unlike .end1, nothing is zeroed here: the source is stack scratch

    add        coeffq, 16*16
    mov        r3d, [rsp+gprsize*1+16*67]
    mov        dstq, [rsp+gprsize*2+16*67]
    add        dstq, 8
    mov        [rsp+gprsize*2+16*67], dstq
    dec        r3d
    jg         .pass2_loop2
    ret


;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x64(dst, stride, coeff, eob)
; Entry point. eob == 0 takes the DC path, which pre-scales the DC value
; and continues in inv_txfm_add_dct_dct_32x8.body (defined elsewhere) with
; r3d = 64 rows; otherwise idct_32x64_internal does the work.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA        r5, $$
%endif
    test       eobd, eobd
    jz         .dconly

    call       m(idct_32x64_internal)
    RET

.dconly:
    movd       m1, [o(pw_2896x8)]
    pmulhrsw   m0, m1, [coeffq]
    movd       m2, [o(pw_16384)]
    mov        [coeffq], eobd                   ; eobd is 0 on this path: clears the DC coefficient
    pmulhrsw   m0, m1
    mov        r3d, 64
    lea        tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)]
    jmp        m(inv_txfm_add_dct_dct_32x8).body

.end:
    RET


;-----------------------------------------------------------------------
; idct_32x64_internal
; Reached by `call` from inv_txfm_add_dct_dct_32x64.
;
; eob-136 is kept at [rsp+gprsize*1+16*67]. If it is negative, pass 1 runs
; 2 iterations and takes the .fast branch; otherwise it runs 4 iterations
; and takes .full. Pass-1 results are written back over coeffq, and pass 2
; is delegated to idct_16x64_internal.pass2_loop.
; The trailing `1` passed to LOAD_8ROWS/LOAD_4ROWS selects the scaled load
; (see LOAD_4ROWS; LOAD_8ROWS is defined elsewhere -- presumably the same).
;-----------------------------------------------------------------------
cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov        r4d, 2
    sub        eobd, 136
    mov        [rsp+gprsize*1+16*67], eobd
    mov        r3d, 4
    cmovs      r3d, r4d                         ; mov leaves flags alone: still the sign of eob-136

%if ARCH_X86_32
    LEA        r5, $$
%endif

    mov        [rsp+gprsize*2+16*67], coeffq    ; remember the start of the coefficients

.pass1_loop:
    LOAD_8ROWS coeffq+64*1, 64*2, 1
    mova       [rsp+gprsize+16*19], m0          ; in1
    mova       [rsp+gprsize+16*26], m1          ; in3
    mova       [rsp+gprsize+16*23], m2          ; in5
    mova       [rsp+gprsize+16*22], m3          ; in7
    mova       [rsp+gprsize+16*21], m4          ; in9
    mova       [rsp+gprsize+16*24], m5          ; in11
    mova       [rsp+gprsize+16*25], m6          ; in13
    mova       [rsp+gprsize+16*20], m7          ; in15

    mov        tx2d, [rsp+gprsize*1+16*67]
    test       tx2d, tx2d
    jl         .fast                            ; eob-136 < 0

.full:
    LOAD_8ROWS coeffq+64*0, 64*4, 1
    call       m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+64*2, 64*4, 1
    call       m(idct_16x8_internal).main
    mova       m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+64*17, 64*2, 1
    mova       [rsp+gprsize+16*33], m0          ; in17
    mova       [rsp+gprsize+16*28], m1          ; in19
    mova       [rsp+gprsize+16*29], m2          ; in21
    mova       [rsp+gprsize+16*32], m3          ; in23
    mova       [rsp+gprsize+16*31], m4          ; in25
    mova       [rsp+gprsize+16*30], m5          ; in27
    mova       [rsp+gprsize+16*27], m6          ; in29
    mova       [rsp+gprsize+16*34], m7          ; in31

    call       m(idct_8x32_internal).main
    jmp        .pass1_end

.fast:
    LOAD_4ROWS coeffq, 256, 1
    pxor       m4, m4
    REPX       {mova x, m4}, m5, m6, m7         ; remaining four inputs are zero
    call       m(idct_8x8_internal).main

    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+128*1, 256, 1
    pxor       m4, m4
    REPX       {mova x, m4}, m5, m6, m7
    call       m(idct_16x8_internal).main
    mova       m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call       m(idct_8x32_internal).main_fast

.pass1_end:
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_32x64_internal).pass1_end1)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end1:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_32x64_internal).pass1_end2)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end2:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_32x64_internal).pass1_end3)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end3:
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_32x64_internal).pass1_end4)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end4:
    SAVE_8ROWS coeffq+64*24, 64

    add        coeffq, 16
    dec        r3d
    jg         .pass1_loop

.pass2:
    mov        coeffq, [rsp+gprsize*2+16*67]    ; rewind to the start of the coefficients
    mov        r3d, 4
    lea        r4, [dstq+8]
    mov        [rsp+gprsize*2+16*67], r4        ; same slot now holds dst+8
    lea        r4, [o(m(idct_16x64_internal).end1)]
    jmp        m(idct_16x64_internal).pass2_loop


;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x32(dst, stride, coeff, eob)
; Entry point. eob == 0 takes the DC path, which pre-scales the DC value
; and reuses inv_txfm_add_dct_dct_64x16.body with r3d = 32 rows; otherwise
; idct_64x32_internal does the work.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA        r5, $$
%endif
    test       eobd, eobd
    jz         .dconly

    call       m(idct_64x32_internal)
    RET

.dconly:
    movd       m1, [o(pw_2896x8)]
    pmulhrsw   m0, m1, [coeffq]
    movd       m2, [o(pw_16384)]
    pmulhrsw   m0, m1
    mov        [coeffq], eobd                   ; eobd is 0 on this path: clears the DC coefficient
    mov        r3d, 32
    lea        tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)]
    jmp        m(inv_txfm_add_dct_dct_64x16).body

.end:
    RET

;-----------------------------------------------------------------------
; idct_64x32_internal
; Reached by `call` from inv_txfm_add_dct_dct_64x32.
;
; Stack slots written at entry:
;   [rsp+gprsize*1+16*67]  eob-136
;   [rsp+gprsize*2+16*67]  original coeffq
;   [rsp+gprsize*3+16*67]  original dstq
;   [rsp+gprsize*4+16*67]  address of the stack scratch (rsp+gprsize+16*69)
;
; Pass 1 runs 2 iterations when eob-136 is negative, else 4. As in the
; 64x16 case, four of the eight 8-row groups go back over coeffq and four
; go to the scratch. Pass 2 is delegated to idct_32x32_internal.pass2_loop:
; first with the scratch as source and dst+32, then (.pass2_end2) with the
; original coeffq and dst.
;-----------------------------------------------------------------------
cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov        r4d, 2
    sub        eobd, 136
    mov        [rsp+gprsize*1+16*67], eobd
    mov        r3d, 4
    cmovs      r3d, r4d                         ; mov leaves flags alone: still the sign of eob-136

%if ARCH_X86_32
    LEA        r5, $$
%endif

    mov        [rsp+gprsize*2+16*67], coeffq
    mov        [rsp+gprsize*3+16*67], dstq
    lea        dstq, [rsp+gprsize+16*69]        ; dstq now addresses stack scratch
    mov        [rsp+gprsize*4+16*67], dstq

.pass1_loop:
    LOAD_4ROWS coeffq+64*0, 64*8, 1
    pxor       m4, m4
    REPX       {mova x, m4}, m5, m6, m7         ; remaining four inputs are zero
    call       m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor       m4, m4
    LOAD_4ROWS coeffq+64*4, 64*8, 1

    REPX       {mova x, m4}, m5, m6, m7
    call       m(idct_16x8_internal).main
    mova       m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+64*2, 64*4, 1
    mova       [rsp+gprsize+16*19], m0
    mova       [rsp+gprsize+16*26], m1
    mova       [rsp+gprsize+16*23], m2
    mova       [rsp+gprsize+16*22], m3
    mova       [rsp+gprsize+16*21], m4
    mova       [rsp+gprsize+16*24], m5
    mova       [rsp+gprsize+16*25], m6
    mova       [rsp+gprsize+16*20], m7

    call       m(idct_8x32_internal).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+64*1, 64*2, 1
    mova       [rsp+gprsize+16*35], m0          ; in1
    mova       [rsp+gprsize+16*49], m1          ; in3
    mova       [rsp+gprsize+16*43], m2          ; in5
    mova       [rsp+gprsize+16*41], m3          ; in7
    mova       [rsp+gprsize+16*39], m4          ; in9
    mova       [rsp+gprsize+16*45], m5          ; in11
    mova       [rsp+gprsize+16*47], m6          ; in13
    mova       [rsp+gprsize+16*37], m7          ; in15

    LOAD_8ROWS coeffq+64*17, 64*2, 1
    mova       [rsp+gprsize+16*63], m0          ; in17
    mova       [rsp+gprsize+16*53], m1          ; in19
    mova       [rsp+gprsize+16*55], m2          ; in21
    mova       [rsp+gprsize+16*61], m3          ; in23
    mova       [rsp+gprsize+16*59], m4          ; in25
    mova       [rsp+gprsize+16*57], m5          ; in27
    mova       [rsp+gprsize+16*51], m6          ; in29
    mova       [rsp+gprsize+16*65], m7          ; in31

    call       m(idct_16x64_internal).main

    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end1)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end1:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end2)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end2:
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end3)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end3:
    SAVE_8ROWS coeffq+64*24, 64
    LOAD_8ROWS rsp+gprsize+16*35, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end4)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end4:
    SAVE_8ROWS dstq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*43, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end5)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end5:
    SAVE_8ROWS dstq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*51, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end6)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end6:
    SAVE_8ROWS dstq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*59, 16
    mova       [rsp+gprsize+16*0], m7
    lea        tx2q, [o(m(idct_64x32_internal).pass1_end7)]
    jmp        m(idct_8x8_internal).pass1_end

.pass1_end7:
    SAVE_8ROWS dstq+64*24, 64

    add        coeffq, 16
    add        dstq, 16
    dec        r3d
    jg         .pass1_loop

.pass2:
    mov        coeffq, [rsp+gprsize*4+16*67]    ; source = stack scratch
    mov        dstq, [rsp+gprsize*3+16*67]
    mov        eobd, [rsp+gprsize*1+16*67]
    lea        dstq, [dstq+32]                  ; destination = dst + 32 bytes
    mov        [rsp+gprsize*1+16*35], eobd      ; NOTE(review): presumably where idct_32x32_internal.pass2_loop reads eob -- confirm there
    lea        tx2q, [o(m(idct_64x32_internal).pass2_end)]
    mov        r3d, 4
    jmp        m(idct_32x32_internal).pass2_loop

.pass2_end:
    mova       [rsp+gprsize+16*0], m7
    lea        r3, [o(m(idct_64x32_internal).pass2_end1)]
    jmp        m(idct_8x32_internal).end2

.pass2_end1:
    lea        tx2q, [o(m(idct_64x32_internal).pass2_end)]
    add        coeffq, 16*32
    mov        dstq, [rsp+gprsize*2+16*35]
    mov        r3d, [rsp+gprsize*3+16*35]
    dec        r3d
    jg         m(idct_32x32_internal).pass2_loop

.pass2_end2:
    mov        dstq, [rsp+gprsize*3+16*67]      ; original dst
    mov        coeffq, [rsp+gprsize*2+16*67]    ; original coefficients
    lea        tx2q, [o(m(idct_32x32_internal).pass2_end)]
    mov        r3d, 4
    jmp        m(idct_32x32_internal).pass2_loop


;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x64(dst, stride, coeff, eob)
; Entry point. eob == 0 takes the DC path, which reuses
; inv_txfm_add_dct_dct_64x16.body with r3d = 64 rows; otherwise
; idct_64x64_internal does the work.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA        r5, $$
%endif
    test       eobd, eobd
    jz         .dconly

    call       m(idct_64x64_internal)
    RET

.dconly:
    movd       m1, [o(pw_2896x8)]
    pmulhrsw   m0, m1, [coeffq]
    movd       m2, [o(pw_8192)]
    mov        [coeffq], eobd                   ; eobd is 0 on this path: clears the DC coefficient
    mov        r3d, 64
    ; NOTE(review): the continuation is the 64x32 entry point's .end (a bare
    ; RET). Both entry points declare the same 16*197 stack size, so the
    ; epilogue should match -- confirm this sharing is intentional.
    lea        tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)]
    jmp        m(inv_txfm_add_dct_dct_64x16).body

;-----------------------------------------------------------------------
; idct_64x64_internal
; Reached by `call` from inv_txfm_add_dct_dct_64x64.
;
; Stack slots written at entry:
;   [rsp+gprsize*1+16*67]  eob-136
;   [rsp+gprsize*2+16*67]  address of the stack scratch (rsp+gprsize+16*69)
;   [rsp+gprsize*3+16*67]  original dstq
;   [rsp+gprsize*4+16*67]  original coeffq
;
; Pass 1 runs 2 iterations when eob-136 is negative, else 4, using the
; unscaled loads and m7 = pw_8192 for idct_8x8_internal.pass1_end1.
; Pass 2 is delegated to idct_16x64_internal.pass2_loop: first with the
; scratch as source and dst+32, then (.pass2_end2) with the original
; coefficients.
;-----------------------------------------------------------------------
cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov        r5d, 4                           ; r5d is only a temporary here; x86-32 reloads r5 just below
    mov        r4d, 2
    sub        eobd, 136
    cmovns     r4d, r5d                         ; r4d = 4 if eob-136 >= 0, else 2

%if ARCH_X86_32
    LEA        r5, $$
%endif

    mov        [rsp+gprsize*1+16*67], eobd
    mov        r3d, r4d                         ; pass-1 iteration count
    mov        [rsp+gprsize*4+16*67], coeffq
    mov        [rsp+gprsize*3+16*67], dstq
    lea        dstq, [rsp+gprsize+16*69]        ; dstq now addresses stack scratch
    mov        [rsp+gprsize*2+16*67], dstq

.pass1_loop:
    LOAD_4ROWS coeffq+64*0, 64*8
    pxor       m4, m4
    REPX       {mova x, m4}, m5, m6, m7         ; remaining four inputs are zero
    call       m(idct_8x8_internal).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor       m4, m4
    LOAD_4ROWS coeffq+64*4, 64*8

    REPX       {mova x, m4}, m5, m6, m7
    call       m(idct_16x8_internal).main
    mova       m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+64*2, 64*4
    mova       [rsp+gprsize+16*19], m0
    mova       [rsp+gprsize+16*26], m1
    mova       [rsp+gprsize+16*23], m2
    mova       [rsp+gprsize+16*22], m3
    mova       [rsp+gprsize+16*21], m4
    mova       [rsp+gprsize+16*24], m5
    mova       [rsp+gprsize+16*25], m6
    mova       [rsp+gprsize+16*20], m7

    call       m(idct_8x32_internal).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+64*1, 64*2
    mova       [rsp+gprsize+16*35], m0          ; in1
    mova       [rsp+gprsize+16*49], m1          ; in3
    mova       [rsp+gprsize+16*43], m2          ; in5
    mova       [rsp+gprsize+16*41], m3          ; in7
    mova       [rsp+gprsize+16*39], m4          ; in9
    mova       [rsp+gprsize+16*45], m5          ; in11
    mova       [rsp+gprsize+16*47], m6          ; in13
    mova       [rsp+gprsize+16*37], m7          ; in15

    LOAD_8ROWS coeffq+64*17, 64*2
    mova       [rsp+gprsize+16*63], m0          ; in17
    mova       [rsp+gprsize+16*53], m1          ; in19
    mova       [rsp+gprsize+16*55], m2          ; in21
    mova       [rsp+gprsize+16*61], m3          ; in23
    mova       [rsp+gprsize+16*59], m4          ; in25
    mova       [rsp+gprsize+16*57], m5          ; in27
    mova       [rsp+gprsize+16*51], m6          ; in29
    mova       [rsp+gprsize+16*65], m7          ; in31

    call       m(idct_16x64_internal).main

    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end1)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end2)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end3)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end3:
    SAVE_8ROWS coeffq+64*24, 64
    LOAD_8ROWS rsp+gprsize+16*35, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end4)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end4:
    SAVE_8ROWS dstq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*43, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end5)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end5:
    SAVE_8ROWS dstq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*51, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end6)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end6:
    SAVE_8ROWS dstq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*59, 16
    mova       [rsp+gprsize+16*0], m7
    mova       m7, [o(pw_8192)]
    lea        tx2q, [o(m(idct_64x64_internal).pass1_end7)]
    jmp        m(idct_8x8_internal).pass1_end1

.pass1_end7:
    SAVE_8ROWS dstq+64*24, 64

    add        coeffq, 16
    add        dstq, 16
    dec        r3d
    jg         .pass1_loop

.pass2:
    mov        dstq, [rsp+gprsize*3+16*67]      ; original dst
    mov        coeffq, [rsp+gprsize*2+16*67]    ; source = stack scratch
    lea        dstq, [dstq+32]                  ; destination = dst + 32 bytes
    mov        r3d, 4
    lea        r4, [dstq+8]
    mov        [rsp+gprsize*2+16*67], r4        ; slot now holds the next destination column
    lea        r4, [o(m(idct_64x64_internal).pass2_end)]
    jmp        m(idct_16x64_internal).pass2_loop

.pass2_end:
    LOAD_8ROWS rsp+gprsize+16*35, 16
    lea        dstq, [dstq+strideq*2]
    add        rsp, 16*32                       ; temporarily rebase rsp; undone in .pass2_end1
    mova       [rsp+gprsize+16*0], m7
    lea        r3, [o(m(idct_64x64_internal).pass2_end1)]
    jmp        m(idct_8x32_internal).end2

.pass2_end1:
    add        coeffq, 16*32
    sub        rsp, 16*32                       ; restore rsp

    mov        dstq, [rsp+gprsize*2+16*67]
    mov        r3d, [rsp+gprsize*3+16*67]       ; NOTE(review): presumably the counter idct_16x64_internal.pass2_loop keeps here -- confirm there
    lea        r4, [dstq+8]
    mov        [rsp+gprsize*2+16*67], r4
    lea        r4, [o(m(idct_64x64_internal).pass2_end)]

    dec        r3d
    jg         m(idct_16x64_internal).pass2_loop

.pass2_end2:
    mov        coeffq, [rsp+gprsize*4+16*67]    ; original coefficients
    mov        dstq, [rsp+gprsize*2+16*67]
    mov        r3d, 4
    sub        dstq, 72
    lea        r4, [dstq+8]
    mov        [rsp+gprsize*2+16*67], r4
    lea        r4, [o(m(idct_16x64_internal).end1)]
    jmp        m(idct_16x64_internal).pass2_loop