; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"


SECTION_RODATA 16

; pshufb byte-shuffle masks used to (de)interleave 16-bit lanes.
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15

deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7

; COEF_PAIR a, b[, flag]
; Emits interleaved word-pair tables meant as pmaddwd operands:
;   pw_<a>_m<b>  = { a, -b } x4    always
;   pw_<b>_<a>   = { b,  a } x4    unless flag == 2
;   pw_m<a>_m<b> = {-a, -b } x4    whenever flag != 0 (so also for flag == 2)
%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
pw_%1_m%2: times 4 dw %1, -%2
%if %3 != 2
pw_%2_%1: times 4 dw %2, %1
%endif
%if %3
pw_m%1_m%2: times 4 dw -%1, -%2
%endif
%endmacro

;adst4
pw_1321_3803: times 4 dw 1321, 3803
pw_2482_m1321: times 4 dw 2482, -1321
pw_3344_2482: times 4 dw 3344, 2482
pw_3344_m3803: times 4 dw 3344, -3803
pw_3344_m3344: times 4 dw 3344, -3344
pw_0_3344: times 4 dw 0, 3344
pw_m6688_m3803: times 4 dw -6688, -3803

COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR 799, 4017
COEF_PAIR 3406, 2276
COEF_PAIR 401, 4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR 3784, 1567, 1
COEF_PAIR 995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4017, 799, 1
COEF_PAIR 201, 4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052, 601
COEF_PAIR 2276, 3406, 1
COEF_PAIR 4076, 401, 2
COEF_PAIR 2598, 3166, 2
COEF_PAIR 3612, 1931, 2
COEF_PAIR 1189, 3920, 2

; Broadcast constants: pd_* are 4 dwords, pw_* are 8 words.
pd_2048: times 4 dd 2048
pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pw_4096: times 8 dw 4096
pw_16384: times 8 dw 16384
pw_m16384: times 8 dw -16384
pw_1697x16: times 8 dw 1697*16
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_3344x8: times 8 dw 3344*8
pw_8192: times 8 dw 8192
pw_m8192: times 8 dw -8192
pw_5: times 8 dw 5
pw_201x8: times 8 dw 201*8
pw_4091x8: times 8 dw 4091*8
pw_m2751x8: times 8 dw -2751*8
pw_3035x8: times 8 dw 3035*8
pw_1751x8: times 8 dw 1751*8
pw_3703x8: times 8 dw 3703*8
pw_m1380x8: times 8 dw -1380*8
pw_3857x8: times 8 dw 3857*8
pw_995x8: times 8 dw 995*8
pw_3973x8: times 8 dw 3973*8
pw_m2106x8: times 8 dw -2106*8
pw_3513x8: times 8 dw 3513*8
pw_2440x8: times 8 dw 2440*8
pw_3290x8: times 8 dw 3290*8
pw_m601x8: times 8 dw -601*8
pw_4052x8: times 8 dw 4052*8

pw_4095x8: times 8 dw 4095*8
pw_101x8: times 8 dw 101*8
pw_2967x8: times 8 dw 2967*8
pw_m2824x8: times 8 dw -2824*8
pw_3745x8: times 8 dw 3745*8
pw_1660x8: times 8 dw 1660*8
pw_3822x8: times 8 dw 3822*8
pw_m1474x8: times 8 dw -1474*8
pw_3996x8: times 8 dw 3996*8
pw_897x8: times 8 dw 897*8
pw_3461x8: times 8 dw 3461*8
pw_m2191x8: times 8 dw -2191*8
pw_3349x8: times 8 dw 3349*8
pw_2359x8: times 8 dw 2359*8
pw_4036x8: times 8 dw 4036*8
pw_m700x8: times 8 dw -700*8
pw_4065x8: times 8 dw 4065*8
pw_501x8: times 8 dw 501*8
pw_3229x8: times 8 dw 3229*8
pw_m2520x8: times 8 dw -2520*8
pw_3564x8: times 8 dw 3564*8
pw_2019x8: times 8 dw 2019*8
pw_3948x8: times 8 dw 3948*8
pw_m1092x8: times 8 dw -1092*8
pw_3889x8: times 8 dw 3889*8
pw_1285x8: times 8 dw 1285*8
pw_3659x8: times 8 dw 3659*8
pw_m1842x8: times 8 dw -1842*8
pw_3102x8: times 8 dw 3102*8
pw_2675x8: times 8 dw 2675*8
pw_4085x8: times 8 dw 4085*8
pw_m301x8: times 8 dw -301*8

SECTION .text

; m(x): mangled name of the function x for the current instruction-set suffix.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; o(x): address of constant x. On x86-32 it is formed relative to r5, which
; INV_TXFM_FN loads with $$ (position-independent access).
%if ARCH_X86_64
%define o(x) x
%else
%define o(x) r5-$$+x ; PIC
%endif

; WRITE_4X4: add two registers of packed residuals to four 4-pixel rows of dst
; and store the result. Clobbers r2 and the three tmp registers.
; Each row[] argument is a row index 0-3: bit 1 selects the base pointer
; (dstq or r2 = dstq+strideq*2), bit 0 adds one stride.
%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4]
    lea r2, [dstq+strideq*2]
%assign %%i 1
; Rotate so the first row argument is seen first; the four per-row rotations
; below complete a full cycle of 9, restoring the original argument order.
%rotate 5
%rep 4
    %if %1 & 2
        CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
    %else
        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    %endif
    %assign %%i %%i + 1
    %rotate 1
%endrep

    movd m%3, [%%row_adr1] ;dst0
    movd m%5, [%%row_adr2] ;dst1
    punpckldq m%3, m%5 ;high: dst1 :low: dst0
    movd m%4, [%%row_adr3] ;dst2
    movd m%5, [%%row_adr4] ;dst3
    punpckldq m%4, m%5 ;high: dst3 :low: dst2

    pxor m%5, m%5
; ---- WRITE_4X4, continued: widen dst bytes to words, add, saturate, store ----
    punpcklbw m%3, m%5 ;extend byte to word
    punpcklbw m%4, m%5 ;extend byte to word

    paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
    paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2

    packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0

    movd [%%row_adr1], m%3 ;store dst0 + out0
    pshuflw m%4, m%3, q1032
    movd [%%row_adr2], m%4 ;store dst1 + out1
    punpckhqdq m%3, m%3
    movd [%%row_adr3], m%3 ;store dst2 + out2
    psrlq m%3, 32
    movd [%%row_adr4], m%3 ;store dst3 + out3
%endmacro

; ITX4_END: optionally scale m0/m1 with pmulhrsw by pw_<rnd> (skipped when
; rnd is 0), add them to the 4x4 destination in the given row order, return.
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
    mova m2, [o(pw_%5)]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
%endif

    WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4
    ret
%endmacro

; Multiply interleaved word pairs by two coefficient pairs, add rnd, shift
; right by 12 and (unless no_pack) pack both results back into dst.
; flags: 1 = swap, 2: coef_regs, 4: no_pack
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
    pmaddwd m%2, m%4, m%1
    pmaddwd m%1, m%5
%elif %6 & 1
    pmaddwd m%2, m%1, [o(pw_%5_%4)]
    pmaddwd m%1, [o(pw_%4_m%5)]
%else
    pmaddwd m%2, m%1, [o(pw_%4_m%5)]
    pmaddwd m%1, [o(pw_%5_%4)]
%endif
    paddd m%2, m%3
    paddd m%1, m%3
    psrad m%2, 12
    psrad m%1, 12
%if %6 & 4 == 0
    packssdw m%1, m%2
%endif
%endmacro

; 4-point inverse DCT on packed rows in m0/m1; clobbers m2 and m3.
%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
    mova m3, [o(pd_2048)]
    punpckhwd m2, m0, m1 ;unpacked in1 in3
    punpcklwd m0, m1 ;unpacked in0 in2
    ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
    ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
    psubsw m1, m0, m2 ;high: out2 ;low: out3
    paddsw m0, m2 ;high: out1 ;low: out0
%endmacro

; INV_TXFM_FN: emit the public entry point inv_txfm_add_<type1>_<type2>_<size>.
; It points tx2q at the second-pass code (.pass2 of the type2 function) and
; enters the type1 function. For dct_dct with eob == 0 control instead falls
; through to whatever the invoking macro places after this expansion.
%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
    %define %%p1 m(i%1_%3_internal_8bpc)
%if ARCH_X86_32
    LEA r5, $$
%endif
%if has_epilogue
%ifidn %1_%2, dct_dct
    test eobd, eobd
    jz %%end
%endif
    lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
    call %%p1
    RET
%%end:
%else
    lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
%ifidn %1_%2, dct_dct
    test eobd, eobd
    jnz %%p1
%else
    ; emit the jmp only when the target is not located after this point
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endif
%endmacro

; 4x4 entry point; for dct_dct it also emits the eob == 0 path that broadcasts
; the first coefficient, scales it and tail-calls the shared store code.
%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x4, 6
%ifidn %1_%2, dct_dct
    pshuflw m0, [coeffq], q0000
    punpcklqdq m0, m0
    mova m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    mov [coeffq], eobd ;0
    pmulhrsw m0, m1
    mova m1, m0
    TAIL_CALL m(iadst_4x4_internal_8bpc).end2
%endif
%endmacro

INIT_XMM ssse3
; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.

INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
INV_TXFM_4X4_FN dct, identity

; 4x4 DCT: pass 1 transforms and reorders, then jumps to tx2q;
; .pass2 transforms again, clears the coefficients and writes to dst.
cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m0, [coeffq+16*0] ;high: in1 ;low: in0
    mova m1, [coeffq+16*1] ;high: in3 ;low in2

    IDCT4_1D_PACKED

    mova m2, [o(deint_shuf)]
    shufps m3, m0, m1, q1331
    shufps m0, m1, q0220
    pshufb m0, m2 ;high: in1 ;low: in0
    pshufb m1, m3, m2 ;high: in3 ;low :in2
    jmp tx2q

.pass2:
    IDCT4_1D_PACKED

    pxor m2, m2
    mova [coeffq+16*0], m2
    mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);

    ITX4_END 0, 1, 3, 2

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; 4x4 ADST. .end (clear coefficients, then store) and .end2 (store only)
; are shared with other 4x4 code paths in this file.
cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*1]
    call .main
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2 ;high: in3 ;low :in2
    punpcklwd m0, m2 ;high: in1 ;low: in0
    jmp tx2q

.pass2:
    call .main

.end:
    pxor m2, m2
    mova [coeffq+16*0], m2
    mova [coeffq+16*1], m2

.end2:
    ITX4_END 0, 1, 2, 3

ALIGN function_align
cglobal_label .main
    punpcklwd m2, m0, m1 ;unpacked in0 in2
    punpckhwd m0, m1 ;unpacked in1 in3
    mova m3, m0
    pmaddwd m1, m2, [o(pw_3344_m3344)] ;3344 * in0 - 3344 * in2
    pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
    paddd m1, m0 ;t2
    pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
    pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
    pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
    pmaddwd m5, m3, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
    paddd m4, m0 ;t0 + t3
    pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
    mova m0, [o(pd_2048)]
    paddd m1, m0 ;t2 + 2048
    paddd m2, m0
    paddd m0, m4 ;t0 + t3 + 2048
    paddd m5, m2 ;t1 + t3 + 2048
    paddd m2, m4
    paddd m2, m3 ;t0 + t1 - t3 + 2048
    REPX {psrad x, 12}, m1, m0, m5, m2
    packssdw m0, m5 ;high: out1 ;low: out0
    packssdw m1, m2 ;high: out3 ;low: out2
    ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

; 4x4 flipped ADST: reuses the ADST kernel with reversed output ordering.
cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*1]
    call m(iadst_4x4_internal_8bpc).main
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m2 ;high: in3 ;low :in2
    punpckhwd m1, m2 ;high: in1 ;low: in0
    jmp tx2q

.pass2:
    call m(iadst_4x4_internal_8bpc).main

.end:
    pxor m2, m2
    mova [coeffq+16*0], m2
    mova [coeffq+16*1], m2

.end2:
    ITX4_END 3, 2, 1, 0

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

; 4x4 identity: x += pmulhrsw(x, pw_1697x8) in each pass.
cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*1]
    mova m3, [o(pw_1697x8)]
    pmulhrsw m2, m0, m3
    pmulhrsw m3, m1
    paddsw m0, m2
    paddsw m1, m3
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2 ;high: in3 ;low :in2
    punpcklwd m0, m2 ;high: in1 ;low: in0
    jmp tx2q

.pass2:
    mova m3, [o(pw_1697x8)]
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    paddsw m0, m2
    paddsw m1, m3
    jmp m(iadst_4x4_internal_8bpc).end

; One 4-point inverse Walsh-Hadamard pass on m0/m1; results end up in m0-m2.
%macro IWHT4_1D_PACKED 0
    punpckhqdq m3, m0, m1 ;low: in1 high: in3
    punpcklqdq m0, m1 ;low: in0 high: in2
    psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3
    paddw m0, m3 ;low: in0 + in1 high: in2 + in3
    punpckhqdq m2, m2 ;t2 t2
    punpcklqdq m0, m0 ;t0 t0
    psubw m1, m0, m2
    psraw m1, 1 ;t4 t4
    psubw m1, m3 ;low: t1/out2 high: t3/out1
    psubw m0, m1 ;high: out0
    paddw m2, m1 ;low: out3
%endmacro

INIT_XMM sse2
; 4x4 WHT: self-contained (no INV_TXFM_FN dispatch); the coefficients are
; cleared right after loading and no final rounding multiply is applied.
cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
    mova m0, [coeffq+16*0]
    mova m1, [coeffq+16*1]
    pxor m2, m2
    mova [coeffq+16*0], m2
    mova [coeffq+16*1], m2
    psraw m0, 2
    psraw m1, 2
    IWHT4_1D_PACKED
    punpckhwd m0, m1
    punpcklwd m3, m1, m2
    punpckhdq m1, m0, m3
    punpckldq m0, m3
    IWHT4_1D_PACKED
    shufpd m0, m2, 0x01
    ITX4_END 0, 3, 2, 1, 0

; 8-point inverse DCT on packed data in m0-m3; clobbers m4 and m6.
%macro IDCT8_1D_PACKED 0
    mova m6, [o(pd_2048)]
    punpckhwd m4, m0, m3 ;unpacked in1 in7
    punpcklwd m0, m2 ;unpacked in0 in4
    punpckhwd m2, m1 ;unpacked in5 in3
    punpcklwd m1, m3 ;unpacked in2 in6
    ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
    ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
    ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
    psubsw m3, m4, m2 ;low: t6a high: t5a
    paddsw m4, m2 ;low: t7 high: t4
    pshufb m3, [o(deint_shuf1)]
    ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
    ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
    psubsw m2, m0, m1 ;low: tmp3 high: tmp2
    paddsw m0, m1 ;low: tmp0 high: tmp1
    punpcklqdq m1, m4, m3 ;low: t7 high: t6
    punpckhqdq m4, m3 ;low: t4 high: t5
    psubsw m3, m0, m1 ;low: out7 high: out6
    paddsw m0, m1 ;low: out0 high: out1
    paddsw m1, m2, m4 ;low: out3 high: out2
    psubsw m2, m4 ;low: out4 high: out5
%endmacro

;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; A coef2 argument below 8 is treated as a register number: both coefficient
; arguments are then registers that already hold the packed pairs.
%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
    punpckhwd m%4, m%1, m%2
    punpcklwd m%1, m%2
%if %7 < 8
    pmaddwd m%2, m%7, m%1
    pmaddwd m%3, m%7, m%4
%else
    mova m%2, [o(pw_%7_%6)]
%if %8
    pmaddwd m%3, m%1, m%2
    pmaddwd m%2, m%4
%else
    pmaddwd m%3, m%4, m%2
    pmaddwd m%2, m%1
%endif
%endif
    paddd m%3, m%5
    paddd m%2, m%5
    psrad m%3, 12
    psrad m%2, 12
%if %8
    packssdw m%3, m%2
%else
    packssdw m%2, m%3 ;dst2
%endif
%if %7 < 8
    pmaddwd m%4, m%6
    pmaddwd m%1, m%6
%elif %8
    mova m%2, [o(pw_%6_m%7)]
    pmaddwd m%4, m%2
    pmaddwd m%1, m%2
%else
    mova m%3, [o(pw_%6_m%7)]
    pmaddwd m%4, m%3
    pmaddwd m%1, m%3
%endif
    paddd m%4, m%5
    paddd m%1, m%5
    psrad m%4, 12
    psrad m%1, 12
    packssdw m%1, m%4 ;dst1
%endmacro

; 4-point inverse DCT with one full register per input row.
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
    ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
    ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
    psubsw m%3, m%1, m%2 ;out2
    paddsw m%2, m%1 ;out1
    paddsw m%1, m%5, m%4 ;out0
    psubsw m%4, m%5 ;out3
%endmacro

; Store a 4x8 block as two 4x4 halves (m0/m1, then m2/m3); advances dstq by
; four rows and uses m4-m6 as temporaries.
%macro WRITE_4X8 4 ;row[1-4]
    WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
    lea dstq, [dstq+strideq*4]
    WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4
%endmacro

; Reorder m0-m3 between the two passes of the 4x8 transforms; clobbers m4.
%macro INV_4X8 0
    punpckhwd m4, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m0, m1
    punpcklwd m0, m1
    punpckhdq m1, m0, m2 ;low: in2 high: in3
    punpckldq m0, m2 ;low: in0 high: in1
    punpckldq m2, m3, m4 ;low: in4 high: in5
    punpckhdq m3, m4 ;low: in6 high: in7
%endmacro

; 4x8 entry point, plus the eob == 0 path for dct_dct.
%macro INV_TXFM_4X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x8, 8
%ifidn %1_%2, dct_dct
    pshuflw m0, [coeffq], q0000
    punpcklqdq m0, m0
    mova m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    mov [coeffq], eobd
    pmulhrsw m0, m1
    pmulhrsw m0, m1
    pmulhrsw m0, [o(pw_2048)]
    mova m1, m0
    mova m2, m0
    mova m3, m0
    TAIL_CALL m(iadst_4x8_internal_8bpc).end3
%endif
%endmacro

INIT_XMM ssse3
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
INV_TXFM_4X8_FN dct, identity

; 4x8 DCT. Coefficients are scaled by pw_2896x8 with pmulhrsw while loading.
cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

.pass1:
    call m(idct_8x4_internal_8bpc).main
    jmp m(iadst_4x8_internal_8bpc).pass1_end

.pass2:
    call .main
    shufps m1, m1, q1032
    shufps m3, m3, q1032
    mova m4, [o(pw_2048)]
    jmp m(iadst_4x8_internal_8bpc).end2

ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret


INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

; 4x8 ADST. .pass1_end, .end, .end2 and .end3 are shared tails that the other
; 4x8 transforms jump into.
cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

.pass1:
    call m(iadst_8x4_internal_8bpc).main

.pass1_end:
    INV_4X8
    jmp tx2q

.pass2:
    shufps m0, m0, q1032
    shufps m1, m1, q1032
    call .main
    mova m4, [o(pw_2048)]
    pxor m5, m5
    psubw m5, m4

.end:
    ; m4 = low half from m4, high half from m5, so the two halves of each
    ; row register can be scaled with opposite signs
    punpcklqdq m4, m5

.end2:
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    pxor m5, m5
    mova [coeffq+16*0], m5
    mova [coeffq+16*1], m5
    mova [coeffq+16*2], m5
    mova [coeffq+16*3], m5

.end3:
    WRITE_4X8 0, 1, 2, 3
    RET

ALIGN function_align
cglobal_label .main
    mova m6, [o(pd_2048)]
    punpckhwd m4, m3, m0 ;unpacked in7 in0
; ---- iadst_4x8_internal_8bpc .main, continued (m6 = pd_2048 rounding) ----
    punpckhwd m5, m2, m1 ;unpacked in5 in2
    punpcklwd m1, m2 ;unpacked in3 in4
    punpcklwd m0, m3 ;unpacked in1 in6
    ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
    ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
    ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
    ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a

    psubsw m3, m4, m1 ;low: t4 high: t5
    paddsw m4, m1 ;low: t0 high: t1
    psubsw m2, m5, m0 ;low: t6 high: t7
    paddsw m5, m0 ;low: t2 high: t3

    shufps m1, m3, m2, q1032
    punpckhwd m2, m1
    punpcklwd m3, m1
    ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
    ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a

    psubsw m1, m4, m5 ;low: t2 high: t3
    paddsw m4, m5 ;low: out0 high: -out7
    psubsw m5, m3, m2 ;low: t7 high: t6
    paddsw m3, m2 ;low: out6 high: -out1
    shufps m0, m4, m3, q3210 ;low: out0 high: -out1
    shufps m3, m4, q3210 ;low: out6 high: -out7

    mova m2, [o(pw_2896_m2896)]
    mova m7, [o(pw_2896_2896)]
    shufps m4, m1, m5, q1032 ;low: t3 high: t7
    shufps m1, m5, q3210 ;low: t2 high: t6
    punpcklwd m5, m1, m4
    punpckhwd m1, m4
    pmaddwd m4, m2, m1 ;-out5
    pmaddwd m2, m5 ; out4
    pmaddwd m1, m7 ; out2
    pmaddwd m5, m7 ;-out3
    REPX {paddd x, m6}, m4, m2, m1, m5
    REPX {psrad x, 12}, m4, m2, m1, m5
    packssdw m1, m5 ;low: out2 high: -out3
    packssdw m2, m4 ;low: out4 high: -out5
    ret

INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

; 4x8 flipped ADST: same kernels as the ADST, with reversed output ordering.
cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

.pass1:
    call m(iadst_8x4_internal_8bpc).main

    punpcklwd m4, m3, m2
    punpckhwd m3, m2
    punpcklwd m5, m1, m0
    punpckhwd m1, m0
; ---- iflipadst_4x8_internal_8bpc .pass1, continued ----
    punpckldq m2, m3, m1 ;low: in4 high: in5
    punpckhdq m3, m1 ;low: in6 high: in7
    punpckldq m0, m4, m5 ;low: in0 high: in1
    punpckhdq m1, m4, m5 ;low: in2 high: in3
    jmp tx2q

.pass2:
    shufps m0, m0, q1032
    shufps m1, m1, q1032
    call m(iadst_4x8_internal_8bpc).main

    mova m4, m0
    mova m5, m1
    pshufd m0, m3, q1032
    pshufd m1, m2, q1032
    pshufd m2, m5, q1032
    pshufd m3, m4, q1032
    mova m5, [o(pw_2048)]
    pxor m4, m4
    psubw m4, m5
    jmp m(iadst_4x8_internal_8bpc).end

INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

; 4x8 identity: pass 1 is x += pmulhrsw(x, pw_1697x8); pass 2 only selects
; pw_4096 as the final multiplier and reuses the shared ADST tail.
cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

.pass1:
    mova m7, [o(pw_1697x8)]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
    pmulhrsw m7, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    jmp m(iadst_4x8_internal_8bpc).pass1_end

.pass2:
    mova m4, [o(pw_4096)]
    jmp m(iadst_4x8_internal_8bpc).end2


; Add two rows of eight residual words to two rows of dst and store them.
; Each coef argument is either a register number or a memory operand.
%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3]
    movq m%3, [dstq]
    movq m%4, [dstq+strideq]
    pxor m%5, m%5
    punpcklbw m%3, m%5 ;extend byte to word
    punpcklbw m%4, m%5 ;extend byte to word
%ifnum %1
    paddw m%3, m%1
%else
    paddw m%3, %1
%endif
%ifnum %2
    paddw m%4, m%2
%else
    paddw m%4, %2
%endif
    packuswb m%3, m%4
    movq [dstq], m%3
    punpckhqdq m%3, m%3
    movq [dstq+strideq], m%3
%endmacro

; Four rows via two WRITE_8X2; dstq is advanced by two rows in between, so it
; is left pointing at the third row.
%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3]
    WRITE_8X2 %1, %2, %5, %6, %7
    lea dstq, [dstq+strideq*2]
    WRITE_8X2 %3, %4, %5, %6, %7
%endmacro

; 8x4 entry point, plus the eob == 0 path for dct_dct.
%macro INV_TXFM_8X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x4, 8
%ifidn %1_%2, dct_dct
    pshuflw m0, [coeffq], q0000
    punpcklqdq m0, m0
    mova m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    pmulhrsw m0, m1
    mova m2, [o(pw_2048)]
    pmulhrsw m0, m1
    pmulhrsw m0, m2
    mova m1, m0
    mova m2, m0
    mova m3, m0
    TAIL_CALL m(iadst_8x4_internal_8bpc).end2
%endif
%endmacro

INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
INV_TXFM_8X4_FN dct, identity

; 8x4 DCT: pass 1 runs the packed 8-point kernel, pass 2 the 4-point kernel.
cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

    call m(idct_4x8_internal_8bpc).main

    mova m4, [o(deint_shuf1)]
    mova m5, [o(deint_shuf2)]
    pshufb m0, m4
    pshufb m1, m5
    pshufb m2, m4
    pshufb m3, m5
    punpckhdq m4, m0, m1
    punpckldq m0, m1
    punpckhdq m5, m2, m3
    punpckldq m2, m3
    punpckhqdq m1, m0, m2 ;in1
    punpcklqdq m0, m2 ;in0
    punpckhqdq m3, m4, m5 ;in3
    punpcklqdq m2, m4, m5 ;in2
    jmp tx2q

.pass2:
    call .main
    jmp m(iadst_8x4_internal_8bpc).end

ALIGN function_align
cglobal_label .main
    mova m6, [o(pd_2048)]
    IDCT4_1D 0, 1, 2, 3, 4, 5, 6
    ret

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; 8x4 ADST. .end (scale by pw_2048), .end2 (clear coefficients) and .end3
; (store) are shared tails that the other 8x4 transforms jump into.
cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

    shufps m0, m0, q1032
    shufps m1, m1, q1032
    call m(iadst_4x8_internal_8bpc).main

    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    pxor m5, m5
    psubsw m3, m5, m1
    psubsw m5, m4
    punpckhdq m4, m5, m3
    punpckldq m5, m3
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckhwd m1, m0, m5 ;in1
    punpcklwd m0, m5 ;in0
    punpcklwd m2, m3, m4 ;in2
    punpckhwd m3, m4 ;in3
    jmp tx2q

.pass2:
    call .main

.end:
    mova m4, [o(pw_2048)]
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    pmulhrsw m2, m4
    pmulhrsw m3, m4

.end2:
    pxor m6, m6
    mova [coeffq+16*0], m6
    mova [coeffq+16*1], m6
    mova [coeffq+16*2], m6
    mova [coeffq+16*3], m6
.end3:
    WRITE_8X4 0, 1, 2, 3, 4, 5, 6
    RET

ALIGN function_align
cglobal_label .main
    punpckhwd m6, m0, m2 ;unpacked in0 in2
    punpcklwd m0, m2 ;unpacked in0 in2
    punpckhwd m7, m1, m3 ;unpacked in1 in3
    punpcklwd m1, m3 ;unpacked in1 in3

    mova m2, [o(pw_3344_m3344)]
    mova m4, [o(pw_0_3344)]
    pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
    pmaddwd m5, m4, m7 ;3344 * in3
    pmaddwd m2, m0
    pmaddwd m4, m1
    paddd m3, m5
    paddd m2, m4
    mova m4, [o(pd_2048)]
    paddd m3, m4 ;t2 + 2048
    paddd m2, m4
    psrad m3, 12
    psrad m2, 12
    packssdw m2, m3 ;out2

    pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
    pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
    pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
    pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
    paddd m3, m4 ;t0 + t3

    pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
    mova m4, [o(pd_2048)]
    paddd m0, m4
    paddd m4, m3 ;t0 + t3 + 2048
    paddd m5, m0 ;t1 + t3 + 2048
    paddd m3, m0
    paddd m3, m1 ;t0 + t1 - t3 + 2048

    psrad m4, 12 ;out0
    psrad m5, 12 ;out1
    psrad m3, 12 ;out3
    packssdw m0, m4, m5 ;low: out0 high: out1

    pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
    pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
    pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
    pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
    paddd m1, m4 ;t0 + t3
    pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3

    mova m4, [o(pd_2048)]
    paddd m6, m4
    paddd m4, m1 ;t0 + t3 + 2048
    paddd m5, m6 ;t1 + t3 + 2048
    paddd m1, m6
    paddd m1, m7 ;t0 + t1 - t3 + 2048

    psrad m4, 12 ;out0
    psrad m5, 12 ;out1
    psrad m1, 12 ;out3
    packssdw m3, m1 ;out3
    packssdw m4, m5 ;low: out0 high: out1

    punpckhqdq m1, m0, m4 ;out1
    punpcklqdq m0, m4 ;out0
    ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; 8x4 flipped ADST: same kernels as the ADST, with reversed output ordering.
cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]

    shufps m0, m0, q1032
    shufps m1, m1, q1032
    call m(iadst_4x8_internal_8bpc).main

    punpckhwd m5, m3, m2
    punpcklwd m3, m2
    punpckhwd m2, m1, m0
    punpcklwd m1, m0

    pxor m0, m0
    psubsw m4, m0, m2
    psubsw m0, m5
    punpckhdq m2, m0, m4
    punpckldq m0, m4
    punpckhdq m4, m3, m1
    punpckldq m3, m1
    punpckhwd m1, m0, m3 ;in1
    punpcklwd m0, m3 ;in0
    punpckhwd m3, m2, m4 ;in3
    punpcklwd m2, m4 ;in2
    jmp tx2q

.pass2:
    call m(iadst_8x4_internal_8bpc).main
    ; reverse the row order: m0 <-> m3, m1 <-> m2
    mova m4, m0
    mova m5, m1
    mova m0, m3
    mova m1, m2
    mova m2, m5
    mova m3, m4
    jmp m(iadst_8x4_internal_8bpc).end

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 identity: pass 1 doubles the (rect-scaled) coefficients and transposes.
cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m3, [o(pw_2896x8)]
    pmulhrsw m0, m3, [coeffq+16*0]
    pmulhrsw m1, m3, [coeffq+16*1]
    pmulhrsw m2, m3, [coeffq+16*2]
    pmulhrsw m3, [coeffq+16*3]
    paddsw m0, m0
    paddsw m1, m1
    paddsw m2, m2
    paddsw m3, m3

    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhdq m5, m4, m1
; ---- iidentity_8x4_internal_8bpc pass 1, continued: finish the transpose ----
    punpckldq m4, m1
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckhwd m1, m0, m4 ;in1
    punpcklwd m0, m4 ;in0
    punpcklwd m2, m3, m5 ;in2
    punpckhwd m3, m5 ;in3
    jmp tx2q

.pass2:
    mova m7, [o(pw_1697x8)]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
    pmulhrsw m7, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    jmp m(iadst_8x4_internal_8bpc).end

; 8x8 entry point (16*4 is passed on to cglobal as the stack-size argument),
; plus the eob == 0 path for dct_dct, which writes the same scaled value to
; all eight rows.
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x8, 8, 16*4
%ifidn %1_%2, dct_dct
    pshuflw m0, [coeffq], q0000
    punpcklwd m0, m0
    mova m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    mova m2, [o(pw_16384)]
    mov [coeffq], eobd
    pmulhrsw m0, m2
    psrlw m2, 3
    pmulhrsw m0, m1
    pmulhrsw m0, m2
.end:
    mov r3d, 2
    lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
.loop:
    ; WRITE_8X4 leaves dstq two rows ahead; the lea below adds the other two
    WRITE_8X4 0, 0, 0, 0, 1, 2, 3
    lea dstq, [dstq+strideq*2]
    dec r3d
    jg .loop
    jmp tx2q
.end3:
    RET
%endif
%endmacro

; Load eight rows into m0-m7; with is_rect2 set each row is additionally
; multiplied by pw_2896x8 with pmulhrsw.
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%if %3
    mova m7, [o(pw_2896x8)]
    pmulhrsw m0, m7, [%1+%2*0]
    pmulhrsw m1, m7, [%1+%2*1]
    pmulhrsw m2, m7, [%1+%2*2]
    pmulhrsw m3, m7, [%1+%2*3]
    pmulhrsw m4, m7, [%1+%2*4]
    pmulhrsw m5, m7, [%1+%2*5]
    pmulhrsw m6, m7, [%1+%2*6]
    pmulhrsw m7, [%1+%2*7]
%else
    mova m0, [%1+%2*0]
    mova m1, [%1+%2*1]
    mova m2, [%1+%2*2]
    mova m3, [%1+%2*3]
    mova m4, [%1+%2*4]
    mova m5, [%1+%2*5]
    mova m6, [%1+%2*6]
    mova m7, [%1+%2*7]
%endif
%endmacro

; Odd half (t4..t7) of the 8-point inverse DCT.
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
    ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
    ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
    psubsw m%2, m%4, m%5 ;t6a
    paddsw m%4, m%5 ;t7
    psubsw m%5, m%1, m%3 ;t5a
    paddsw m%1, m%3 ;t4
    ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
INV_TXFM_8X8_FN dct, identity

; 8x8 DCT. Pass 1 transforms, scales by pw_16384 with pmulhrsw, transposes the
; 8x8 block of words and jumps to tx2q. The labels .pass1_end1, .pass1_end2
; and .pass1_end3 are entry points other 8x8 transforms in this file jump to.
cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq, 16

.pass1:
    call .main

.pass1_end:
    mova m7, [o(pw_16384)]

.pass1_end1:
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova [rsp+gprsize+16*1], m6

.pass1_end2:
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw m7, [rsp+gprsize+16*0]

cglobal_label .pass1_end3
    ; 8x8 word transpose; digit pairs in the comments are source row/column
    punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
    punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
    punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
    punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
    punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
    punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
    punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
    punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
    punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
    punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
    mova [rsp+gprsize+16*2], m6
    mova m6, [rsp+gprsize+16*1]
    punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
    punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
    punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
    punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
    punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
    punpcklwd m0, m2 ;00 20 40 60 01 21 41 61

    punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
    punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
    mova [rsp+gprsize+16*0], m2
    punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
    punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
    punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
    punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
    mova m7, [rsp+gprsize+16*2]
    punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
    punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
    mova m7, [rsp+gprsize+16*0]
    jmp tx2q

.pass2:
    lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
.pass2_main:
    call .main

.end:
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova [rsp+gprsize+16*1], m6

.end2:
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw m7, [rsp+gprsize+16*0]
    mova [rsp+gprsize+16*2], m5
    mova [rsp+gprsize+16*0], m7

.end3:
    ; rows 0-3 from registers, rows 4-7 from m4 and the three stack slots
    WRITE_8X4 0, 1, 2, 3, 5, 6, 7
    lea dstq, [dstq+strideq*2]
    WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
    jmp tx2q

.end4:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    ret

; 8-point inverse DCT over m0-m7. Stack slots are addressed with gprsize*2
; here (gprsize elsewhere) because .main is entered through a call.
; On return the value labelled out7 is in stack slot 0, not in m7.
ALIGN function_align
cglobal_label .main
    mova [rsp+gprsize*2+16*0], m7
    mova [rsp+gprsize*2+16*1], m3
    mova [rsp+gprsize*2+16*2], m1
    mova m7, [o(pd_2048)]
    IDCT4_1D 0, 2, 4, 6, 1, 3, 7
    mova m3, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*2], m2
    mova m2, [rsp+gprsize*2+16*1]
    mova [rsp+gprsize*2+16*1], m4
    mova m4, [rsp+gprsize*2+16*0]
    mova [rsp+gprsize*2+16*0], m6
    IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
    mova m6, [rsp+gprsize*2+16*0]
    psubsw m7, m0, m4 ;out7
    paddsw m0, m4 ;out0
    mova [rsp+gprsize*2+16*0], m7
    mova m1, [rsp+gprsize*2+16*2]
    psubsw m4, m6, m3 ;out4
    paddsw m3, m6 ;out3
    mova m7, [rsp+gprsize*2+16*1]
    psubsw m6, m1, m5 ;out6
    paddsw m1, m5 ;out1
    psubsw m5, m7, m2 ;out5
    paddsw m2, m7 ;out2
    ret


INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; 8x8 ADST. Both passes reuse the idct_8x8 tails, but scale the odd rows with
; the negated multiplier because .main leaves those outputs negated.
cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq, 16

.pass1:
    call .main
    call .main_pass1_end

.pass1_end:
    mova m7, [o(pw_16384)]

.pass1_end1:
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova [rsp+gprsize+16*1], m6
    pxor m6, m6
    psubw m6, m7
    mova m7, m6
    jmp m(idct_8x8_internal_8bpc).pass1_end2

ALIGN function_align
.pass2:
    lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.pass2_main:
    call .main
    call .main_pass2_end

.end:
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova [rsp+gprsize+16*1], m6
    pxor m6, m6
    psubw m6, m7
    mova m7, m6
    jmp m(idct_8x8_internal_8bpc).end2

; First stage of the 8-point ADST. Leaves out0, -out1, out6 in m0, m1, m6 and
; -out7 in stack slot 0; t2, t3, t6, t7 stay in m4, m3, m5, m2 for
; .main_pass1_end / .main_pass2_end.
ALIGN function_align
cglobal_label .main
    mova [rsp+gprsize*2+16*0], m7
    mova [rsp+gprsize*2+16*1], m3
    mova [rsp+gprsize*2+16*2], m4
    mova m7, [o(pd_2048)]
    ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
    ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
    paddsw m3, m2, m6 ;t2
    psubsw m2, m6 ;t6
    paddsw m4, m5, m1 ;t3
    psubsw m5, m1 ;t7
    ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a

    mova m6, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*2], m5
    mova m1, [rsp+gprsize*2+16*1]
    mova [rsp+gprsize*2+16*1], m2
    mova m5, [rsp+gprsize*2+16*0]
    mova [rsp+gprsize*2+16*0], m3
    ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
    ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
    psubsw m2, m0, m6 ;t4
    paddsw m0, m6 ;t0
    paddsw m3, m5, m1 ;t1
    psubsw m5, m1 ;t5
    ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a

    mova m7, [rsp+gprsize*2+16*0]
    paddsw m1, m3, m4 ;-out7
    psubsw m3, m4 ;t3
    mova [rsp+gprsize*2+16*0], m1
    psubsw m4, m0, m7 ;t2
    paddsw m0, m7 ;out0
    mova m6, [rsp+gprsize*2+16*2]
    mova m7, [rsp+gprsize*2+16*1]
    paddsw m1, m5, m6 ;-out1
    psubsw m5, m6 ;t6
    paddsw m6, m2, m7 ;out6
    psubsw m2, m7 ;t7
    ret
; Final ADST stage for pass 1, using pmaddwd with 32-bit rounding. m1 and m6
; are parked on the stack while their registers are used as temporaries.
ALIGN function_align
.main_pass1_end:
    mova [rsp+gprsize*2+16*1], m1
    mova [rsp+gprsize*2+16*2], m6
    punpckhwd m1, m4, m3
    punpcklwd m4, m3
    punpckhwd m7, m5, m2
    punpcklwd m5, m2
    mova m2, [o(pw_2896_2896)]
    mova m6, [o(pd_2048)]
    pmaddwd m3, m2, m7
    pmaddwd m2, m5
    paddd m3, m6
    paddd m2, m6
1305 psrad m3, 12 1306 psrad m2, 12 1307 packssdw m2, m3 ;out2 1308 mova m3, [o(pw_2896_m2896)] 1309 pmaddwd m7, m3 1310 pmaddwd m5, m3 1311 paddd m7, m6 1312 paddd m5, m6 1313 psrad m7, 12 1314 psrad m5, 12 1315 packssdw m5, m7 ;-out5 1316 mova m3, [o(pw_2896_2896)] 1317 pmaddwd m7, m3, m1 1318 pmaddwd m3, m4 1319 paddd m7, m6 1320 paddd m3, m6 1321 psrad m7, 12 1322 psrad m3, 12 1323 packssdw m3, m7 ;-out3 1324 mova m7, [o(pw_2896_m2896)] 1325 pmaddwd m1, m7 1326 pmaddwd m4, m7 1327 paddd m1, m6 1328 paddd m4, m6 1329 psrad m1, 12 1330 psrad m4, 12 1331 packssdw m4, m1 ;out4 1332 mova m1, [rsp+gprsize*2+16*1] 1333 mova m6, [rsp+gprsize*2+16*2] 1334 ret 1335ALIGN function_align 1336cglobal_label .main_pass2_end 1337 paddsw m7, m4, m3 ;t2 + t3 1338 psubsw m4, m3 ;t2 - t3 1339 paddsw m3, m5, m2 ;t6 + t7 1340 psubsw m5, m2 ;t6 - t7 1341 mova m2, [o(pw_2896x8)] 1342 pmulhrsw m4, m2 ;out4 1343 pmulhrsw m5, m2 ;-out5 1344 pmulhrsw m7, m2 ;-out3 1345 pmulhrsw m2, m3 ;out2 1346 mova m3, m7 1347 ret 1348 1349INV_TXFM_8X8_FN flipadst, dct 1350INV_TXFM_8X8_FN flipadst, adst 1351INV_TXFM_8X8_FN flipadst, flipadst 1352INV_TXFM_8X8_FN flipadst, identity 1353 1354cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1355 LOAD_8ROWS coeffq, 16 1356 1357.pass1: 1358 call m(iadst_8x8_internal_8bpc).main 1359 call m(iadst_8x8_internal_8bpc).main_pass1_end 1360 1361.pass1_end: 1362 mova m7, [o(pw_m16384)] 1363 1364.pass1_end1: 1365 pmulhrsw m1, m7 1366 mova [rsp+gprsize+16*1], m1 1367 mova m1, m6 1368 mova m6, m2 1369 pmulhrsw m2, m5, m7 1370 mova m5, m6 1371 mova m6, m4 1372 pmulhrsw m4, m3, m7 1373 mova m3, m6 1374 mova m6, m0 1375 mova m0, m7 1376 pxor m7, m7 1377 psubw m7, m0 1378 pmulhrsw m0, [rsp+gprsize+16*0] 1379 REPX {pmulhrsw x, m7}, m1, m3, m5 1380 pmulhrsw m7, m6 1381 jmp m(idct_8x8_internal_8bpc).pass1_end3 1382 1383ALIGN function_align 1384.pass2: 1385 lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] 1386 1387.pass2_main: 1388 call
m(iadst_8x8_internal_8bpc).main 1389 call m(iadst_8x8_internal_8bpc).main_pass2_end 1390 1391.end: 1392 mova m7, [o(pw_2048)] 1393 REPX {pmulhrsw x, m7}, m0, m2, m4, m6 1394 mova [rsp+gprsize+16*2], m2 1395 mova m2, m0 1396 pxor m0, m0 1397 psubw m0, m7 1398 mova m7, m2 1399 pmulhrsw m1, m0 1400 pmulhrsw m2, m5, m0 1401 mova [rsp+gprsize+16*1], m1 1402 mova m5, m4 1403 mova m1, m6 1404 pmulhrsw m4, m3, m0 1405 pmulhrsw m0, [rsp+gprsize+16*0] 1406 mova m3, m5 1407 mova [rsp+gprsize+16*0], m7 1408 jmp m(idct_8x8_internal_8bpc).end3 1409 1410INV_TXFM_8X8_FN identity, dct 1411INV_TXFM_8X8_FN identity, adst 1412INV_TXFM_8X8_FN identity, flipadst 1413INV_TXFM_8X8_FN identity, identity 1414 1415cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1416 LOAD_8ROWS coeffq, 16 1417 mova [rsp+gprsize+16*1], m6 1418 jmp m(idct_8x8_internal_8bpc).pass1_end3 1419 1420ALIGN function_align 1421.pass2: 1422 lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] 1423 1424.end: 1425 pmulhrsw m7, [o(pw_4096)] 1426 mova [rsp+gprsize+16*0], m7 1427 mova m7, [o(pw_4096)] 1428 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 1429 mova [rsp+gprsize+16*2], m5 1430 mova [rsp+gprsize+16*1], m6 1431 jmp m(idct_8x8_internal_8bpc).end3 1432 1433 1434%macro INV_TXFM_4X16_FN 2 ; type1, type2 1435 INV_TXFM_FN %1, %2, 4x16, 8 1436%ifidn %1_%2, dct_dct 1437 pshuflw m0, [coeffq], q0000 1438 punpcklwd m0, m0 1439 mova m1, [o(pw_2896x8)] 1440 pmulhrsw m0, m1 1441 mov [coeffq], eobd 1442 pmulhrsw m0, [o(pw_16384)] 1443 pmulhrsw m0, m1 1444 pmulhrsw m0, [o(pw_2048)] 1445.end: 1446 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1447 lea dstq, [dstq+strideq*4] 1448 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1449 lea dstq, [dstq+strideq*4] 1450 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1451 lea dstq, [dstq+strideq*4] 1452 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1453 RET 1454%endif 1455%endmacro 1456 1457INV_TXFM_4X16_FN dct, dct 1458INV_TXFM_4X16_FN dct, adst 1459INV_TXFM_4X16_FN dct, flipadst 1460INV_TXFM_4X16_FN dct, 
identity 1461 1462cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1463 lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] 1464 1465.pass1: 1466 mova m0, [coeffq+16*1] 1467 mova m1, [coeffq+16*3] 1468 mova m2, [coeffq+16*5] 1469 mova m3, [coeffq+16*7] 1470 push tx2q 1471 lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] 1472 jmp r3 1473 1474.pass1_2: 1475 mova [coeffq+16*1], m0 1476 mova [coeffq+16*3], m1 1477 mova [coeffq+16*5], m2 1478 mova [coeffq+16*7], m3 1479 mova m0, [coeffq+16*0] 1480 mova m1, [coeffq+16*2] 1481 mova m2, [coeffq+16*4] 1482 mova m3, [coeffq+16*6] 1483 lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] 1484 jmp r3 1485 1486.pass1_end: 1487 pop tx2q 1488 1489 mova m4, [coeffq+16*1] 1490 mova m5, [coeffq+16*3] 1491 mova m6, [coeffq+16*5] 1492 mova m7, [o(pw_16384)] 1493 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 1494 1495 pmulhrsw m7, [coeffq+16*7] 1496 mova [coeffq+16*7], m7 1497 jmp tx2q 1498 1499.pass2: 1500 call m(idct_16x4_internal_8bpc).main 1501 1502.end: 1503 mova m7, [o(pw_2048)] 1504 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 1505 pmulhrsw m7, [coeffq+16*7] 1506 mova [coeffq+16*4], m4 1507 1508.end1: 1509 mova [coeffq+16*5], m5 1510 mova [coeffq+16*6], m6 1511 mov r3, coeffq 1512 WRITE_4X8 0, 1, 3, 2 1513 1514 mova m0, [r3+16*4] 1515 mova m1, [r3+16*5] 1516 mova m2, [r3+16*6] 1517 mova m3, m7 1518 lea dstq, [dstq+strideq*4] 1519 WRITE_4X8 0, 1, 3, 2 1520 1521.end2: 1522 pxor m7, m7 1523 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 1524 ret 1525 1526INV_TXFM_4X16_FN adst, dct 1527INV_TXFM_4X16_FN adst, adst 1528INV_TXFM_4X16_FN adst, flipadst 1529INV_TXFM_4X16_FN adst, identity 1530 1531cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1532 lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] 1533 jmp m(idct_4x16_internal_8bpc).pass1 1534 1535.pass2: 1536 call m(iadst_16x4_internal_8bpc).main 1537 call m(iadst_16x4_internal_8bpc).main_pass2_end 1538 1539 punpcklqdq m6, m5, m4 ;low: 
-out5 high: -out7 1540 punpckhqdq m4, m5 ;low: out8 high: out10 1541 punpcklqdq m5, m7, m2 ;low: out4 high: out6 1542 punpckhqdq m2, m7 ;low: -out9 high: -out11 1543 mova [coeffq+16*4], m2 1544 mova [coeffq+16*5], m6 1545 mova m2, [coeffq+16*6] 1546 mova m6, [coeffq+16*7] 1547 punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 1548 punpcklqdq m0, m6 ;low: out0 high: out2 1549 punpckhqdq m6, m3, m2 ;low: out12 high: out14 1550 punpcklqdq m2, m3 ;low: -out1 high: -out3 1551 1552 mova m7, [o(pw_2048)] 1553 1554.end1: 1555 REPX {pmulhrsw x, m7}, m0, m5, m4, m6 1556 pxor m3, m3 1557 psubw m3, m7 1558 mova m7, [coeffq+16*4] 1559 REPX {pmulhrsw x, m3}, m2, m7, m1 1560 pmulhrsw m3, [coeffq+16*5] 1561 mova [coeffq+16*7], m5 1562 1563 punpckhqdq m5, m4, m7 ;low: out10 high: out11 1564 punpcklqdq m4, m7 ;low: out8 high: out9 1565 punpckhqdq m7, m6, m1 ;low: out14 high: out15 1566 punpcklqdq m6, m1 ;low: out12 high: out13 1567 punpckhqdq m1, m0, m2 ;low: out2 high: out3 1568 punpcklqdq m0, m2 ;low: out0 high: out1 1569 mova [coeffq+16*4], m4 1570 mova m4, [coeffq+16*7] 1571 punpcklqdq m2, m4, m3 ;low: out4 high: out5 1572 punpckhqdq m4, m3 ;low: out6 high: out7 1573 mova m3, m4 1574 1575.end2: 1576 mova [coeffq+16*5], m5 1577 mova [coeffq+16*6], m6 1578 mov r3, coeffq 1579 WRITE_4X8 0, 1, 2, 3 1580 1581 mova m0, [r3+16*4] 1582 mova m1, [r3+16*5] 1583 mova m2, [r3+16*6] 1584 mova m3, m7 1585 lea dstq, [dstq+strideq*4] 1586 WRITE_4X8 0, 1, 2, 3 1587 1588.end3: 1589 pxor m7, m7 1590 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 1591 ret 1592 1593 1594INV_TXFM_4X16_FN flipadst, dct 1595INV_TXFM_4X16_FN flipadst, adst 1596INV_TXFM_4X16_FN flipadst, flipadst 1597INV_TXFM_4X16_FN flipadst, identity 1598 1599cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1600 lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] 1601 jmp m(idct_4x16_internal_8bpc).pass1 1602 1603.pass2: 1604 call m(iadst_16x4_internal_8bpc).main 1605 call 
m(iadst_16x4_internal_8bpc).main_pass2_end 1606 1607 punpckhqdq m6, m5, m4 ;low: out5 high: out7 1608 punpcklqdq m4, m5 ;low: -out8 high: -out10 1609 punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 1610 punpcklqdq m2, m7 ;low: out9 high: out11 1611 mova [coeffq+16*4], m2 1612 mova [coeffq+16*5], m6 1613 mova m2, [coeffq+16*6] 1614 mova m6, [coeffq+16*7] 1615 punpcklqdq m1, m6, m0 ;low: out13 high: out15 1616 punpckhqdq m0, m6 ;low: -out0 high: -out2 1617 punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 1618 punpckhqdq m2, m3 ;low: out1 high: out3 1619 1620 mova m7, [o(pw_m2048)] 1621 jmp m(iadst_4x16_internal_8bpc).end1 1622 1623 1624INV_TXFM_4X16_FN identity, dct 1625INV_TXFM_4X16_FN identity, adst 1626INV_TXFM_4X16_FN identity, flipadst 1627INV_TXFM_4X16_FN identity, identity 1628 1629%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] 1630 pmulhrsw m%2, m%3, m%1 1631%if %0 == 4 ; if downshifting by 1 1632 pmulhrsw m%2, m%4 1633%else 1634 paddsw m%1, m%1 1635%endif 1636 paddsw m%1, m%2 1637%endmacro 1638 1639cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1640 mova m0, [coeffq+16*1] 1641 mova m6, [o(pw_1697x8)] 1642 mova m1, [coeffq+16*3] 1643 mova m2, [coeffq+16*5] 1644 mova m3, [coeffq+16*7] 1645 pcmpeqw m7, m7 1646 mov r3, tx2q 1647 lea tx2q, [o(.pass1_2)] 1648.pass1: 1649 pmulhrsw m4, m6, m0 1650 pmulhrsw m5, m6, m1 1651 pavgw m4, m0 1652 pcmpeqw m0, m7 1653 pavgw m5, m1 1654 pcmpeqw m1, m7 1655 pandn m0, m4 1656 pmulhrsw m4, m6, m2 1657 pandn m1, m5 1658 pmulhrsw m5, m6, m3 1659 pavgw m4, m2 1660 pcmpeqw m2, m7 1661 pavgw m5, m3 1662 pcmpeqw m3, m7 1663 pandn m2, m4 1664 pandn m3, m5 1665 jmp m(iadst_4x8_internal_8bpc).pass1_end 1666.pass1_2: 1667 mova [coeffq+16*1], m0 1668 mova [coeffq+16*3], m1 1669 mova [coeffq+16*5], m2 1670 mova [coeffq+16*7], m3 1671 mova m0, [coeffq+16*0] 1672 mova m1, [coeffq+16*2] 1673 mova m2, [coeffq+16*4] 1674 mova m3, [coeffq+16*6] 1675 lea tx2q, [o(.pass1_end)] 1676 jmp .pass1 1677.pass1_end: 1678
mova m4, [coeffq+16*1] 1679 mova m5, [coeffq+16*3] 1680 mova m6, [coeffq+16*5] 1681 jmp r3 1682.pass2: 1683 mova m7, [o(pw_1697x16)] 1684 mova [coeffq+16*6], m6 1685 REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 1686 mova m6, [coeffq+16*7] 1687 IDTX16 6, 7, 7 1688 mova [coeffq+16*7], m6 1689 mova m6, [coeffq+16*6] 1690 pmulhrsw m7, m6, [o(pw_1697x16)] 1691 paddsw m6, m6 1692 paddsw m6, m7 1693 mova m7, [o(pw_2048)] 1694 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 1695 pmulhrsw m7, [coeffq+16*7] 1696 mova [coeffq+16*4], m4 1697 jmp m(iadst_4x16_internal_8bpc).end2 1698 1699 1700%macro INV_TXFM_16X4_FN 2 ; type1, type2 1701 INV_TXFM_FN %1, %2, 16x4, 8 1702%ifidn %1_%2, dct_dct 1703 movd m1, [o(pw_2896x8)] 1704 pmulhrsw m0, m1, [coeffq] 1705 movd m2, [o(pw_16384)] 1706 mov [coeffq], eobd 1707 mov r2d, 2 1708 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] 1709.dconly: 1710 pmulhrsw m0, m2 1711 movd m2, [o(pw_2048)] ;intentionally rip-relative 1712 pmulhrsw m0, m1 1713 pmulhrsw m0, m2 1714 pshuflw m0, m0, q0000 1715 punpcklwd m0, m0 1716 pxor m5, m5 1717.dconly_loop: 1718 mova m1, [dstq] 1719 mova m3, [dstq+strideq] 1720 punpckhbw m2, m1, m5 1721 punpcklbw m1, m5 1722 punpckhbw m4, m3, m5 1723 punpcklbw m3, m5 1724 paddw m2, m0 1725 paddw m1, m0 1726 paddw m4, m0 1727 paddw m3, m0 1728 packuswb m1, m2 1729 packuswb m3, m4 1730 mova [dstq], m1 1731 mova [dstq+strideq], m3 1732 lea dstq, [dstq+strideq*2] 1733 dec r2d 1734 jg .dconly_loop 1735 jmp tx2q 1736.end: 1737 RET 1738%endif 1739%endmacro 1740 1741%macro LOAD_7ROWS 2 ;src, stride 1742 mova m0, [%1+%2*0] 1743 mova m1, [%1+%2*1] 1744 mova m2, [%1+%2*2] 1745 mova m3, [%1+%2*3] 1746 mova m4, [%1+%2*4] 1747 mova m5, [%1+%2*5] 1748 mova m6, [%1+%2*6] 1749%endmacro 1750 1751%macro SAVE_7ROWS 2 ;dst, stride 1752 mova [%1+%2*0], m0 1753 mova [%1+%2*1], m1 1754 mova [%1+%2*2], m2 1755 mova [%1+%2*3], m3 1756 mova [%1+%2*4], m4 1757 mova [%1+%2*5], m5 1758 mova [%1+%2*6], m6 1759%endmacro 1760 1761%macro
IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] 1762 punpckhwd m%5, m%4, m%1 ;packed in13 in3 1763 punpcklwd m%1, m%4 ;packed in1 in15 1764 punpcklwd m%4, m%3, m%2 ;packed in9 in7 1765 punpckhwd m%2, m%3 ;packed in5 in11 1766 mova m%7, [o(pd_2048)] 1767 ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a 1768 ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a 1769 ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a 1770 ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a 1771 psubsw m%6, m%1, m%4 ;low: t9 high: t14 1772 paddsw m%1, m%4 ;low: t8 high: t15 1773 psubsw m%4, m%5, m%2 ;low: t10 high: t13 1774 paddsw m%5, m%2 ;low: t11 high: t12 1775 mova m%2, [o(deint_shuf2)] 1776 pshufb m%6, m%2 1777 pshufb m%4, m%2 1778 ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a 1779 ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a 1780 psubsw m%3, m%1, m%5 ;low: t11a high: t12a 1781 paddsw m%1, m%5 ;low: t8a high: t15a 1782 psubsw m%5, m%6, m%4 ;low: t10 high: t13 1783 paddsw m%6, m%4 ;low: t9 high: t14 1784 pshufb m%3, m%2 1785 pshufb m%5, m%2 1786 ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 1787 ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a 1788 packssdw m%2, m%4 ;low: t11 high: t10a 1789 packssdw m%3, m%5 ;low: t12 high: t13a 1790 punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 1791 punpcklqdq m%1, m%6 ;low: t8a high: t9 1792%endmacro 1793 1794INV_TXFM_16X4_FN dct, dct 1795INV_TXFM_16X4_FN dct, adst 1796INV_TXFM_16X4_FN dct, flipadst 1797INV_TXFM_16X4_FN dct, identity 1798 1799cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1800 LOAD_7ROWS coeffq, 16 1801 call .main 1802 1803.pass1_end: 1804 punpckhwd m7, m0, m2 ;packed out1, out5 1805 punpcklwd m0, m2 ;packed out0, out4 1806 punpcklwd m2, m1, m3 ;packed out3, out7 1807 punpckhwd m1, m3 ;packed out2, out6 1808 mova [coeffq+16*6], m7 1809 mova m7, [coeffq+16*7] 1810 punpckhwd m3, m4, m6 ;packed out9, out13 1811 punpcklwd 
m4, m6 ;packed out8, out12 1812 punpcklwd m6, m5, m7 ;packed out11, out15 1813 punpckhwd m5, m7 ;packed out10, out14 1814 1815.pass1_end2: 1816 mova m7, [o(pw_16384)] 1817 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 1818 pmulhrsw m7, [coeffq+16*6] 1819 mova [coeffq+16*6], m7 1820 1821.pass1_end3: 1822 punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high 1823 punpcklwd m3, m6 ;packed 9, 11, 13, 15 low 1824 punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high 1825 punpcklwd m4, m5 ;packed 8, 10, 12, 14 low 1826 punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) 1827 punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) 1828 punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) 1829 punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) 1830 mova [coeffq+16*7], m3 1831 mova m3, [coeffq+16*6] 1832 punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high 1833 punpcklwd m3, m2 ;packed 1, 3, 5, 7 low 1834 punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high 1835 punpcklwd m0, m1 ;packed 0, 2, 4, 6 low 1836 punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) 1837 punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) 1838 punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) 1839 punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) 1840 jmp tx2q 1841 1842.pass2: 1843 lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] 1844 1845.pass2_end: 1846 mova [coeffq+16*4], m4 1847 mova [coeffq+16*5], m5 1848 mova [coeffq+16*6], m6 1849 lea r3, [dstq+8] 1850 call tx2q 1851 1852 add coeffq, 16*4 1853 mova m0, [coeffq+16*0] 1854 mova m1, [coeffq+16*1] 1855 mova m2, [coeffq+16*2] 1856 mova m3, [coeffq+16*3] 1857 mov dstq, r3 1858 jmp tx2q 1859 1860ALIGN function_align 1861cglobal_label .main 1862 punpckhqdq m7, m0, m1 ;low:in1 high:in3 1863 punpcklqdq m0, m1 1864 punpcklqdq m1, m2, m3 1865 punpckhqdq m3, m2 ;low:in7 high:in5 1866 mova [coeffq+16*4], m7 1867 mova [coeffq+16*5], m3 1868 mova m7, [coeffq+16*7] 1869 punpcklqdq m2, m4, m5 1870 punpckhqdq m4, m5 ;low:in9 high:in11 1871 punpcklqdq m3, m6, m7 1872 punpckhqdq m7, m6 ;low:in15 high:in13 1873
mova [coeffq+16*6], m4 1874 IDCT8_1D_PACKED 1875 mova m6, [coeffq+16*4] 1876 mova m4, [coeffq+16*5] 1877 mova m5, [coeffq+16*6] 1878 mova [coeffq+16*4], m1 1879 mova [coeffq+16*5], m2 1880 mova [coeffq+16*6], m3 1881 1882 IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 1883 1884 mova m1, [coeffq+16*4] 1885 psubsw m3, m0, m7 ;low:out15 high:out14 1886 paddsw m0, m7 ;low:out0 high:out1 1887 psubsw m7, m1, m5 ;low:out12 high:out13 1888 paddsw m1, m5 ;low:out3 high:out2 1889 mova [coeffq+16*7], m3 1890 mova m2, [coeffq+16*5] 1891 mova m3, [coeffq+16*6] 1892 psubsw m5, m2, m4 ;low:out11 high:out10 1893 paddsw m2, m4 ;low:out4 high:out5 1894 psubsw m4, m3, m6 ;low:out8 high:out9 1895 paddsw m3, m6 ;low:out7 high:out6 1896 mova m6, m7 1897 ret 1898 1899INV_TXFM_16X4_FN adst, dct 1900INV_TXFM_16X4_FN adst, adst 1901INV_TXFM_16X4_FN adst, flipadst 1902INV_TXFM_16X4_FN adst, identity 1903 1904cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1905 LOAD_7ROWS coeffq, 16 1906 call .main 1907 call .main_pass1_end 1908 1909 punpckhwd m6, m7, m0 ;packed -out11, -out15 1910 punpcklwd m0, m7 ;packed out0, out4 1911 punpcklwd m7, m3, m4 ;packed -out3, -out7 1912 punpckhwd m4, m3 ;packed out8, out12 1913 mova m1, [coeffq+16*6] 1914 punpcklwd m3, m1, m5 ;packed -out1, -out5 1915 punpckhwd m5, m1 ;packed out10, out14 1916 mova m1, [coeffq+16*7] 1917 mova [coeffq+16*6], m3 1918 mova [coeffq+16*7], m7 1919 punpckhwd m3, m2, m1 ;packed -out9, -out13 1920 punpcklwd m1, m2 ;packed out2, out6 1921 1922 mova m7, [o(pw_16384)] 1923 1924.pass1_end: 1925 REPX {pmulhrsw x, m7}, m0, m1, m4, m5 1926 pxor m2, m2 1927 psubw m2, m7 1928 mova m7, [coeffq+16*6] 1929 REPX {pmulhrsw x, m2}, m7, m3, m6 1930 pmulhrsw m2, [coeffq+16*7] 1931 mova [coeffq+16*6], m7 1932 jmp m(idct_16x4_internal_8bpc).pass1_end3 1933 1934.pass2: 1935 lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] 1936 jmp m(idct_16x4_internal_8bpc).pass2_end 1937 1938ALIGN function_align 1939cglobal_label .main 1940 mova 
[coeffq+16*6], m0 1941 pshufd m0, m1, q1032 1942 pshufd m2, m2, q1032 1943 punpckhwd m1, m6, m0 ;packed in13, in2 1944 punpcklwd m0, m6 ;packed in3, in12 1945 punpckhwd m7, m5, m2 ;packed in11, in4 1946 punpcklwd m2, m5 ;packed in5, in10 1947 mova m6, [o(pd_2048)] 1948 ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 1949 ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 1950 ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 1951 ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 1952 psubsw m5, m1, m2 ;low:t10a high:t11a 1953 paddsw m1, m2 ;low:t2a high:t3a 1954 psubsw m2, m7, m0 ;low:t12a high:t13a 1955 paddsw m7, m0 ;low:t4a high:t5a 1956 punpcklqdq m0, m5 1957 punpckhwd m0, m5 ;packed t10a, t11a 1958 punpcklqdq m5, m2 1959 punpckhwd m2, m5 ;packed t13a, t12a 1960 ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 1961 ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 1962 mova [coeffq+16*4], m1 1963 mova [coeffq+16*5], m7 1964 mova m1, [coeffq+16*6] 1965 mova m7, [coeffq+16*7] 1966 pshufd m1, m1, q1032 1967 pshufd m3, m3, q1032 1968 punpckhwd m5, m7, m1 ;packed in15, in0 1969 punpcklwd m1, m7 ;packed in1, in14 1970 punpckhwd m7, m4, m3 ;packed in9, in6 1971 punpcklwd m3, m4 ;packed in7, in8 1972 ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 1973 ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 1974 ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 1975 ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 1976 psubsw m4, m5, m3 ;low:t8a high:t9a 1977 paddsw m5, m3 ;low:t0a high:t1a 1978 psubsw m3, m7, m1 ;low:t14a high:t15a 1979 paddsw m7, m1 ;low:t6a high:t7a 1980 punpcklqdq m1, m4 1981 punpckhwd m1, m4 ;packed t8a, t9a 1982 punpcklqdq m4, m3 1983 punpckhwd m3, m4 ;packed t15a, t14a 1984 ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 1985 ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 1986 paddsw m4, m1, m2 ;low:t12a high:t13a 1987 psubsw m1, m2 ;low:t8a high:t9a 1988 psubsw m2, m0, m3 ;low:t14a high:t15a 1989 paddsw m0, m3 
;low:t10a high:t11a 1990 punpcklqdq m3, m1 1991 punpckhwd m3, m1 ;packed t12a, t13a 1992 punpcklqdq m1, m2 1993 punpckhwd m2, m1 ;packed t15a, t14a 1994 ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 1995 ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 1996 psubsw m1, m3, m2 ;low:t14a high:t15a 1997 paddsw m3, m2 ;low:out2 high:-out13 1998 psubsw m2, m4, m0 ;low:t10 high:t11 1999 paddsw m0, m4 ;low:-out1 high:out14 2000 mova [coeffq+16*6], m0 2001 mova [coeffq+16*7], m3 2002 mova m0, [coeffq+16*4] 2003 mova m3, [coeffq+16*5] 2004 psubsw m4, m5, m3 ;low:t4 high:t5 2005 paddsw m5, m3 ;low:t0 high:t1 2006 psubsw m3, m0, m7 ;low:t6 high:t7 2007 paddsw m0, m7 ;low:t2 high:t3 2008 punpcklqdq m7, m4 2009 punpckhwd m7, m4 ;packed t4, t5 2010 punpcklqdq m4, m3 2011 punpckhwd m3, m4 ;packed t7, t6 2012 ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a 2013 ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a 2014 psubsw m4, m5, m0 ;low:t2a high:t3a 2015 paddsw m0, m5 ;low:out0 high:-out15 2016 psubsw m5, m7, m3 ;low:t6 high:t7 2017 paddsw m3, m7 ;low:-out3 high:out12 2018 ret 2019ALIGN function_align 2020.main_pass1_end: 2021 mova m7, [o(deint_shuf1)] 2022 mova [coeffq+16*4], m0 2023 mova [coeffq+16*5], m3 2024 mova m0, [o(pw_2896_m2896)] 2025 mova m3, [o(pw_2896_2896)] 2026 pshufb m1, m7 ;t14a t15a 2027 pshufb m2, m7 ;t10 t11 2028 pshufb m4, m7 ;t2a t3a 2029 pshufb m5, m7 ;t6 t7 2030 pmaddwd m7, m0, m2 2031 pmaddwd m2, m3 2032 paddd m7, m6 2033 paddd m2, m6 2034 psrad m7, 12 2035 psrad m2, 12 2036 packssdw m2, m7 ;low:out6 high:-out9 2037 pmaddwd m7, m0, m4 2038 pmaddwd m4, m3 2039 paddd m7, m6 2040 paddd m4, m6 2041 psrad m7, 12 2042 psrad m4, 12 2043 packssdw m4, m7 ;low:-out7 high:out8 2044 pmaddwd m7, m3, m5 2045 pmaddwd m5, m0 2046 paddd m7, m6 2047 paddd m5, m6 2048 psrad m7, 12 2049 psrad m5, 12 2050 packssdw m7, m5 ;low:out4 high:-out11 2051 pmaddwd m5, m3, m1 2052 pmaddwd m1, m0 2053 paddd m5, m6 2054 paddd m1, m6 2055 psrad m5, 12 2056 psrad 
m1, 12 2057 packssdw m5, m1 ;low:-out5 high:out10 2058 mova m0, [coeffq+16*4] 2059 mova m3, [coeffq+16*5] 2060 ret 2061ALIGN function_align 2062cglobal_label .main_pass2_end 2063 mova m7, [o(pw_2896x8)] 2064 punpckhqdq m6, m2, m1 ;low:t11 high:t15a 2065 punpcklqdq m2, m1 ;low:t10 high:t14a 2066 psubsw m1, m2, m6 2067 paddsw m2, m6 2068 punpckhqdq m6, m4, m5 ;low:t3a high:t7 2069 punpcklqdq m4, m5 ;low:t2a high:t6 2070 psubsw m5, m4, m6 2071 paddsw m4, m6 2072 pmulhrsw m1, m7 ;low:-out9 high:out10 2073 pmulhrsw m2, m7 ;low:out6 high:-out5 2074 pmulhrsw m5, m7 ;low:out8 high:-out11 2075 pmulhrsw m4, m7 ;low:-out7 high:out4 2076 punpckhqdq m7, m4, m5 ;low:out4 high:-out11 2077 punpcklqdq m4, m5 ;low:-out7 high:out8 2078 punpckhqdq m5, m2, m1 ;low:-out5 high:out10 2079 punpcklqdq m2, m1 ;low:out6 high:-out9 2080 ret 2081 2082 2083INV_TXFM_16X4_FN flipadst, dct 2084INV_TXFM_16X4_FN flipadst, adst 2085INV_TXFM_16X4_FN flipadst, flipadst 2086INV_TXFM_16X4_FN flipadst, identity 2087 2088cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2089 LOAD_7ROWS coeffq, 16 2090 call m(iadst_16x4_internal_8bpc).main 2091 call m(iadst_16x4_internal_8bpc).main_pass1_end 2092 2093 punpcklwd m6, m7, m0 ;packed out11, out15 2094 punpckhwd m0, m7 ;packed -out0, -out4 2095 punpckhwd m7, m3, m4 ;packed out3, out7 2096 punpcklwd m4, m3 ;packed -out8, -out12 2097 mova m1, [coeffq+16*6] 2098 punpckhwd m3, m1, m5 ;packed out1, out5 2099 punpcklwd m5, m1 ;packed -out10, -out14 2100 mova m1, [coeffq+16*7] 2101 mova [coeffq+16*6], m3 2102 mova [coeffq+16*7], m7 2103 punpcklwd m3, m2, m1 ;packed out9, out13 2104 punpckhwd m1, m2 ;packed -out2, -out6 2105 2106 mova m7, [o(pw_m16384)] 2107 jmp m(iadst_16x4_internal_8bpc).pass1_end 2108 2109.pass2: 2110 lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] 2111 jmp m(idct_16x4_internal_8bpc).pass2_end 2112 2113 2114INV_TXFM_16X4_FN identity, dct 2115INV_TXFM_16X4_FN identity, adst 2116INV_TXFM_16X4_FN identity, flipadst 
2117INV_TXFM_16X4_FN identity, identity 2118 2119cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2120 mova m1, [coeffq+16*6] 2121 mova m0, [coeffq+16*5] 2122 mova m2, [coeffq+16*7] 2123 mova m6, [o(pw_1697x16)] 2124 mova m7, [o(pw_16384)] 2125 pmulhrsw m4, m6, m1 2126 pmulhrsw m3, m6, m0 2127 pmulhrsw m5, m6, m2 2128 pmulhrsw m4, m7 2129 pmulhrsw m3, m7 2130 pmulhrsw m5, m7 2131 paddsw m1, m4 2132 paddsw m0, m3 2133 paddsw m5, m2 2134 mova m2, [coeffq+16*2] 2135 mova m3, [coeffq+16*3] 2136 mova m4, [coeffq+16*4] 2137 mova [coeffq+16*6], m1 2138 mova [coeffq+16*5], m0 2139 mova [coeffq+16*7], m5 2140 pmulhrsw m0, m6, m2 2141 pmulhrsw m1, m6, m3 2142 pmulhrsw m5, m6, m4 2143 pmulhrsw m0, m7 2144 pmulhrsw m1, m7 2145 pmulhrsw m5, m7 2146 paddsw m2, m0 2147 paddsw m3, m1 2148 paddsw m4, m5 2149 mova m0, [coeffq+16*0] 2150 mova m1, [coeffq+16*1] 2151 pmulhrsw m5, m6, m0 2152 pmulhrsw m6, m1 2153 pmulhrsw m5, m7 2154 pmulhrsw m6, m7 2155 paddsw m0, m5 2156 paddsw m1, m6 2157 mova m6, [coeffq+16*6] 2158 mova m5, [coeffq+16*5] 2159 punpckhwd m7, m0, m2 ;packed out1, out5 2160 punpcklwd m0, m2 ;packed out0, out4 2161 punpckhwd m2, m1, m3 ;packed out3, out7 2162 punpcklwd m1, m3 ;packed out2, out6 2163 mova [coeffq+16*6], m7 2164 mova m7, [coeffq+16*7] 2165 punpckhwd m3, m4, m6 ;packed out9, out13 2166 punpcklwd m4, m6 ;packed out8, out12 2167 punpckhwd m6, m5, m7 ;packed out11, out15 2168 punpcklwd m5, m7 ;packed out10, out14 2169 jmp m(idct_16x4_internal_8bpc).pass1_end3 2170 2171.pass2: 2172 lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] 2173 jmp m(idct_16x4_internal_8bpc).pass2_end 2174 2175 2176%macro SAVE_8ROWS 2 ;dst, stride 2177 mova [%1+%2*0], m0 2178 mova [%1+%2*1], m1 2179 mova [%1+%2*2], m2 2180 mova [%1+%2*3], m3 2181 mova [%1+%2*4], m4 2182 mova [%1+%2*5], m5 2183 mova [%1+%2*6], m6 2184 mova [%1+%2*7], m7 2185%endmacro 2186 2187%macro INV_TXFM_8X16_FN 2 ; type1, type2 2188 INV_TXFM_FN %1, %2, 8x16, 8, 16*16 2189%ifidn %1_%2,
dct_dct 2190 pshuflw m0, [coeffq], q0000 2191 punpcklwd m0, m0 2192 mova m1, [o(pw_2896x8)] 2193 pmulhrsw m0, m1 2194 mova m2, [o(pw_16384)] 2195 mov [coeffq], eobd 2196 pmulhrsw m0, m1 2197 pmulhrsw m0, m2 2198 psrlw m2, 3 ; pw_2048 2199 pmulhrsw m0, m1 2200 pmulhrsw m0, m2 2201 mov r3d, 4 2202 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] 2203 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop 2204.end: 2205 RET 2206%endif 2207%endmacro 2208 2209INV_TXFM_8X16_FN dct, dct 2210INV_TXFM_8X16_FN dct, adst 2211INV_TXFM_8X16_FN dct, flipadst 2212INV_TXFM_8X16_FN dct, identity 2213 2214cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2215 lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] 2216 2217.pass1: 2218 LOAD_8ROWS coeffq+16*1, 32, 1 2219 mov [rsp+gprsize+16*11], tx2q 2220 lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] 2221 jmp r3 2222 2223.pass1_end: 2224 SAVE_8ROWS coeffq+16*1, 32 2225 LOAD_8ROWS coeffq+16*0, 32, 1 2226 mov tx2q, [rsp+gprsize+16*11] 2227 jmp r3 2228 2229.pass2: 2230 lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] 2231 2232.pass2_pre: 2233 mova [coeffq+16*2 ], m1 2234 mova [coeffq+16*6 ], m3 2235 mova [coeffq+16*10], m5 2236 mova [coeffq+16*14], m7 2237 mova m1, m2 2238 mova m2, m4 2239 mova m3, m6 2240 mova m4, [coeffq+16*1 ] 2241 mova m5, [coeffq+16*5 ] 2242 mova m6, [coeffq+16*9 ] 2243 mova m7, [coeffq+16*13] 2244 2245.pass2_main: 2246 call m(idct_8x8_internal_8bpc).main 2247 2248 SAVE_7ROWS rsp+gprsize+16*3, 16 2249 mova m0, [coeffq+16*2 ] 2250 mova m1, [coeffq+16*6 ] 2251 mova m2, [coeffq+16*10] 2252 mova m3, [coeffq+16*14] 2253 mova m4, [coeffq+16*3 ] 2254 mova m5, [coeffq+16*7 ] 2255 mova m6, [coeffq+16*11] 2256 mova m7, [coeffq+16*15] 2257 call m(idct_16x8_internal_8bpc).main 2258 2259 mov r3, dstq 2260 lea dstq, [dstq+strideq*8] 2261 jmp m(idct_8x8_internal_8bpc).end 2262 2263.end: 2264 LOAD_8ROWS rsp+gprsize+16*3, 16 2265 mova [rsp+gprsize+16*0], m7 2266 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] 2267 mov dstq, r3 
2268 jmp m(idct_8x8_internal_8bpc).end 2269 2270.end1: 2271 pxor m7, m7 2272 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 2273 ret 2274 2275INV_TXFM_8X16_FN adst, dct 2276INV_TXFM_8X16_FN adst, adst 2277INV_TXFM_8X16_FN adst, flipadst 2278INV_TXFM_8X16_FN adst, identity 2279 2280cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2281 lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] 2282 jmp m(idct_8x16_internal_8bpc).pass1 2283 2284.pass2: 2285 lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] 2286 2287.pass2_pre: 2288 mova [rsp+gprsize+16*7], m0 2289 mova [rsp+gprsize+16*8], m1 2290 mova [rsp+gprsize+16*5], m6 2291 mova [rsp+gprsize+16*6], m7 2292 mova m0, m2 2293 mova m1, m3 2294 mova m2, m4 2295 mova m3, m5 2296 2297.pass2_main: 2298 mova m4, [coeffq+16*1 ] 2299 mova m5, [coeffq+16*3 ] 2300 mova m6, [coeffq+16*13] 2301 mova m7, [coeffq+16*15] 2302 mova [rsp+gprsize+16*3], m4 2303 mova [rsp+gprsize+16*4], m5 2304 mova [rsp+gprsize+16*9], m6 2305 mova [rsp+gprsize+32*5], m7 2306 mova m4, [coeffq+16*5 ] 2307 mova m5, [coeffq+16*7 ] 2308 mova m6, [coeffq+16*9 ] 2309 mova m7, [coeffq+16*11] 2310 2311 call m(iadst_16x8_internal_8bpc).main 2312 call m(iadst_16x8_internal_8bpc).main_pass2_end 2313 2314 mov r3, dstq 2315 lea dstq, [dstq+strideq*8] 2316 jmp m(iadst_8x8_internal_8bpc).end 2317 2318.end: 2319 LOAD_8ROWS rsp+gprsize+16*3, 16 2320 mova [rsp+gprsize+16*0], m7 2321 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] 2322 mov dstq, r3 2323 jmp m(iadst_8x8_internal_8bpc).end 2324 2325 2326INV_TXFM_8X16_FN flipadst, dct 2327INV_TXFM_8X16_FN flipadst, adst 2328INV_TXFM_8X16_FN flipadst, flipadst 2329INV_TXFM_8X16_FN flipadst, identity 2330 2331cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2332 lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] 2333 jmp m(idct_8x16_internal_8bpc).pass1 2334 2335.pass2: 2336 lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] 2337 lea r3, [dstq+strideq*8] 
; (continuation of iflipadst_8x16_internal_8bpc; its header and .pass2 are on
; the preceding source line)

; Spill m0/m1 to stack slots 7/8 and m6/m7 to slots 5/6, then move m2..m5
; down into m0..m3. Also jumped to by iflipadst_16x16_internal_8bpc.
.pass2_pre:
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*5], m6
    mova [rsp+gprsize+16*6], m7
    mova m0, m2
    mova m1, m3
    mova m2, m4
    mova m3, m5

; Copy coefficient rows 1, 3, 13, 15 to stack slots 3, 4, 9, 10 (32*5 == 16*10),
; load rows 5, 7, 9, 11 into m4..m7, run the shared 16-point iadst core and
; its pass-2 finishing step, then continue in the 8x8 iflipadst tail.
.pass2_main:
    mova m4, [coeffq+16*1 ]
    mova m5, [coeffq+16*3 ]
    mova m6, [coeffq+16*13]
    mova m7, [coeffq+16*15]
    mova [rsp+gprsize+16*3], m4
    mova [rsp+gprsize+16*4], m5
    mova [rsp+gprsize+16*9], m6
    mova [rsp+gprsize+32*5], m7
    mova m4, [coeffq+16*5 ]
    mova m5, [coeffq+16*7 ]
    mova m6, [coeffq+16*9 ]
    mova m7, [coeffq+16*11]

    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass2_end
    jmp m(iflipadst_8x8_internal_8bpc).end

; Reload eight rows from stack slots 3.., park m7 in slot 0, switch dstq to
; the address saved in r3 and run the 8x8 iflipadst tail again.
.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov dstq, r3
    jmp m(iflipadst_8x8_internal_8bpc).end


INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; iidentity_8x16_internal_8bpc
; Pass 1 runs twice through idct_8x8 .pass1_end3: first on the rows at
; coeffq+16*1 (stride 32), then, from .pass1_end, on the rows at coeffq+16*0.
; The caller's tx2q is kept in r3 meanwhile. m6 is stored to stack slot 1
; before each jump (presumably .pass1_end3 expects it there -- TODO confirm).
cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq+16*1, 32, 1
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    mova [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS coeffq+16*0, 32, 1
    mov tx2q, r3
    mova [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

.pass2:
    lea tx2q, [o(.end1)]

; Apply IDTX16 (macro defined outside this chunk) to all eight rows with
; pw_1697x16 in m7, then multiply every row by pw_2048 with pmulhrsw.
; m6 and m7 are spilled first because they serve as temp and constant.
; On exit rows 5, 6, 7 are in stack slots 0, 1, 2 for idct_8x8 .end3.
.end:
    mova [rsp+gprsize+16*0], m7
    mova [rsp+gprsize+16*1], m6
    mova m7, [o(pw_1697x16)]
    REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    mova m6, [rsp+gprsize+16*1]
    mova [rsp+gprsize+16*2], m5
    IDTX16 6, 5, 7
    mova m5, [rsp+gprsize+16*0]
    IDTX16 5, 7, 7
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw m7, [rsp+gprsize+16*2]
    mova [rsp+gprsize+16*0], m5
    mova [rsp+gprsize+16*1], m6
    mova [rsp+gprsize+16*2], m7
    jmp m(idct_8x8_internal_8bpc).end3

; Second half: load the rows at coeffq+16*1, advance dstq by 2*stride and
; rerun .end, finishing in the coefficient-clearing idct_8x16 .end1.
.end1:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea dstq, [dstq+strideq*2]
    jmp .end


; INV_TXFM_16X8_FN type1, type2
; Forwards to INV_TXFM_FN for size 16x8. For dct_dct only, it also emits a
; DC-only path: scale the first coefficient by pw_2896x8 twice (pmulhrsw),
; load pw_16384 into m2, write eobd to [coeffq], set r2d = 4 and jump to the
; 16x4 .dconly code with tx2q pointing at the RET below.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_16384)]
    mov [coeffq], eobd
    pmulhrsw m0, m1
    mov r2d, 4
    lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.end:
    RET
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
INV_TXFM_16X8_FN dct, identity

; idct_16x8_internal_8bpc
; Pass 1: run the 8x8 idct core on the rows at coeffq+16*0 (stride 32), save
; seven result rows to stack slots 3.., then run .main on the rows at
; coeffq+16*1. The 8x8 .pass1_end tail is used twice; tx2q selects where it
; continues and the caller's tx2q is kept in r3.
cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq+16*0, 32, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+16*1, 32, 1
    call .main
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    jmp m(idct_8x8_internal_8bpc).pass1_end

; Pass 2: r3 = dstq+8 is used as dst for the second 8x8 pass2_main run,
; which .end starts on the rows at coeffq+16*1.
.pass2:
    lea tx2q, [o(.end)]
    lea r3, [dstq+8]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov dstq, r3
    jmp m(idct_8x8_internal_8bpc).pass2_main


; .main: called subroutine (note the gprsize*2 stack offsets, presumably one
; extra gprsize for the return address). Intermediate and output names follow
; the in-line ;tN / ;outN annotations. m2, m6, m5 are spilled first so they
; can serve as scratch; m6 then holds pd_2048 for ITX_MULSUB_2W.
ALIGN function_align
cglobal_label .main
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+16*2], m6
    mova [rsp+gprsize*2+32*5], m5

    mova m6, [o(pd_2048)]
    ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a
    ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a
    psubsw m2, m0, m4 ;t9
; (continuation of idct_16x8_internal_8bpc .main, which starts on the
; preceding source line; m6 still holds pd_2048 at this point)
; Values are rotated between registers and the stack slots at
; [rsp+gprsize*2+16*N]; the ;tN / ;outN annotations name what each holds.
    paddsw m0, m4 ;t8
    psubsw m4, m7, m3 ;t14
    paddsw m7, m3 ;t15
    ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a
    ; swap the spilled inputs back in while parking t14a, t9a and t15
    mova m3, [rsp+gprsize*2+16*1]
    mova m5, [rsp+gprsize*2+32*5]
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+32*5], m4
    mova m2, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*2], m7
    ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
    ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
    psubsw m4, m2, m3 ;t10
    paddsw m2, m3 ;t11
    psubsw m3, m1, m5 ;t13
    paddsw m1, m5 ;t12
    ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a
    mova m7, [rsp+gprsize*2+32*5]
    psubsw m6, m0, m2 ;t11a
    paddsw m0, m2 ;t8a
    paddsw m2, m7, m3 ;t9
    psubsw m7, m3 ;t10
    ; slots 0 and 9 are read here and combined with t8a / t9
    mova m5, [rsp+gprsize*2+16*0]
    psubsw m3, m5, m0 ;out8
    paddsw m0, m5 ;out7
    mova [rsp+gprsize*2+32*5], m0
    mova m5, [rsp+gprsize*2+16*9]
    psubsw m0, m5, m2 ;out9
    paddsw m2, m5 ;out6
    mova [rsp+gprsize*2+16*0], m0
    mova [rsp+gprsize*2+16*9], m2
    mova m0, [rsp+gprsize*2+16*1]
    mova m2, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*1], m3
    psubsw m5, m0, m4 ;t13
    paddsw m0, m4 ;t14
    mova m3, [o(pd_2048)] ; m6 was overwritten above, so reload the constant into m3
    psubsw m4, m2, m1 ;t12a
    paddsw m1, m2 ;t15a
    mova [rsp+gprsize*2+16*2], m1
    ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
    ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
    ; combine with the values in slots 8, 7, 6, 5, 4
    mova m3, [rsp+gprsize*2+16*8]
    psubsw m2, m3, m5 ;out10
    paddsw m3, m5 ;out5
    mova m5, [rsp+gprsize*2+16*7]
    mova [rsp+gprsize*2+16*8], m3
    psubsw m3, m5, m4 ;out11
    paddsw m5, m4 ;out4
    mova m4, [rsp+gprsize*2+16*6]
    mova [rsp+gprsize*2+16*7], m5
    paddsw m5, m4, m6 ;out3
    psubsw m4, m6 ;out12
    mova m6, [rsp+gprsize*2+16*5]
    mova [rsp+gprsize*2+16*6], m5
    psubsw m5, m6, m7 ;out13
    paddsw m6, m7 ;out2
    mova m7, [rsp+gprsize*2+16*4]
    mova [rsp+gprsize*2+16*5], m6
    psubsw m6, m7, m0 ;out14
; (end of idct_16x8_internal_8bpc .main, continued from the preceding source
; line) out0..out6 end up in stack slots 3..9 and out7 in slot 10 (32*5);
; out8..out14 are returned in m0..m6 and out15 is stored to slot 0.
    paddsw m7, m0 ;out1
    mova m1, [rsp+gprsize*2+16*2]
    mova m0, [rsp+gprsize*2+16*3]
    mova [rsp+gprsize*2+16*4], m7
    psubsw m7, m0, m1 ;out15
    paddsw m0, m1 ;out0
    mova [rsp+gprsize*2+16*3], m0
    mova m1, [rsp+gprsize*2+16*0]
    mova m0, [rsp+gprsize*2+16*1]
    mova [rsp+gprsize*2+16*0], m7
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; iadst_16x8_internal_8bpc
; Pass 1 prologue: every coefficient row is multiplied by pw_2896x8 with
; pmulhrsw as it is loaded. Rows 0, 1, 14, 15 go to stack slots 7, 8, 9, 10
; (32*5 == 16*10); rows 8, 9, 6, 7 go to slots 3, 4, 5, 6; rows 2, 3, 4, 5,
; 10, 11, 12, 13 stay in m0..m7.
cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m7, [o(pw_2896x8)]
    pmulhrsw m0, m7, [coeffq+16*0 ]
    pmulhrsw m1, m7, [coeffq+16*1 ]
    pmulhrsw m2, m7, [coeffq+16*14]
    pmulhrsw m3, m7, [coeffq+16*15]
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*9], m2
    mova [rsp+gprsize+32*5], m3
    pmulhrsw m0, m7, [coeffq+16*6 ]
    pmulhrsw m1, m7, [coeffq+16*7 ]
    pmulhrsw m2, m7, [coeffq+16*8 ]
    pmulhrsw m3, m7, [coeffq+16*9 ]
    mova [rsp+gprsize+16*3], m2
    mova [rsp+gprsize+16*4], m3
    mova [rsp+gprsize+16*5], m0
    mova [rsp+gprsize+16*6], m1
    pmulhrsw m0, m7, [coeffq+16*2 ]
    pmulhrsw m1, m7, [coeffq+16*3 ]
    pmulhrsw m2, m7, [coeffq+16*4 ]
    pmulhrsw m3, m7, [coeffq+16*5 ]
    pmulhrsw m4, m7, [coeffq+16*10]
    pmulhrsw m5, m7, [coeffq+16*11]
    pmulhrsw m6, m7, [coeffq+16*12]
    pmulhrsw m7, [coeffq+16*13]

    ; Core transform plus pass-1 finishing step, then the 8x8 iadst
    ; .pass1_end tail; the caller's tx2q is kept in r3 meanwhile.
    call .main
    call .main_pass1_end
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end

; Save the first eight result rows to coeffq+16*1 (stride 32), reload the
; other eight from stack slots 3.. and run the tail again with the caller's
; tx2q restored.
.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    jmp m(iadst_8x8_internal_8bpc).pass1_end

; Pass 2: two runs of the 8x8 iadst pass2_main; r3 = dstq+8 is the dst for
; the second run, which .end starts on the rows at coeffq+16*1.
.pass2:
    lea tx2q, [o(.end)]
    lea r3, [dstq+8]
    jmp m(iadst_8x8_internal_8bpc).pass2_main

.end:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov dstq, r3
    jmp m(iadst_8x8_internal_8bpc).pass2_main

; .main: shared core, also called by the 8x16 and 16x16 (flip)adst routines
; in this file. Inputs are split between m0..m7 and stack slots 3..10 as the
; ;inN annotations show; m1, m2, m6 are spilled to slots 0..2 first and m6
; then holds pd_2048 for ITX_MULSUB_2W. Stack offsets use gprsize*2
; (presumably one extra gprsize for the return address).
ALIGN function_align
cglobal_label .main
    mova [rsp+gprsize*2+16*0], m1
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+16*2], m6

    mova m6, [o(pd_2048)]
    ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2
    ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10
    psubsw m1, m0, m4 ;t10a
    paddsw m0, m4 ;t2a
    psubsw m4, m7, m3 ;t11a
    paddsw m3, m7 ;t3a
    ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10
    mova m2, [rsp+gprsize*2+16*0] ;in3
    mova m7, [rsp+gprsize*2+16*1] ;in4
    mova [rsp+gprsize*2+16*0], m1 ;t11
    mova [rsp+gprsize*2+16*1], m4 ;t10
    mova m1, [rsp+gprsize*2+16*2] ;in12
    mova [rsp+gprsize*2+16*2], m0 ;t2a
    ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4
    ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12
    psubsw m0, m7, m1 ;t12a
    paddsw m1, m7 ;t4a
    psubsw m4, m5, m2 ;t13a
    paddsw m5, m2 ;t5a
    ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13
    mova m2, [rsp+gprsize*2+16*8] ;in1
    mova m7, [rsp+gprsize*2+16*9] ;in14
    mova [rsp+gprsize*2+16*8], m4 ;t12
    mova [rsp+gprsize*2+16*9], m0 ;t13
    mova m4, [rsp+gprsize*2+16*4] ;in9
    mova m0, [rsp+gprsize*2+16*5] ;in6
    mova [rsp+gprsize*2+16*4], m1 ;t4a
    mova [rsp+gprsize*2+16*5], m5 ;t5a
    ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14
    ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6
    psubsw m1, m0, m7 ;t14a
    paddsw m0, m7 ;t6a
    psubsw m5, m4, m2 ;t15a
    paddsw m4, m2 ;t7a
    ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15
    mova m2, [rsp+gprsize*2+16*2] ;t2a
    mova [rsp+gprsize*2+16*2], m5 ;t14
    psubsw m7, m2, m0 ;t6
    paddsw m2, m0 ;t2
    psubsw m0, m3, m4 ;t7
    paddsw m3, m4 ;t3
    ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a
    mova m4, [rsp+gprsize*2+16*7] ;in0
    mova m5, [rsp+gprsize*2+32*5] ;in15
    mova [rsp+gprsize*2+16*7], m3 ;t3
    mova [rsp+gprsize*2+32*5], m1 ;t15
    mova m1, [rsp+gprsize*2+16*6] ;in7
    mova m3, [rsp+gprsize*2+16*3] ;in8
    mova [rsp+gprsize*2+16*6], m7 ;t7a
    mova [rsp+gprsize*2+16*3], m0 ;t6a
    ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0
    ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8
    psubsw m0, m4, m3 ;t8a
    paddsw m4, m3 ;t0a
    psubsw m3, m5, m1 ;t9a
    paddsw m5, m1 ;t1a
    ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8
    mova m1, [rsp+gprsize*2+16*4] ;t4a
    mova m7, [rsp+gprsize*2+16*5] ;t5a
    mova [rsp+gprsize*2+16*4], m3 ;t8
    mova [rsp+gprsize*2+16*5], m0 ;t9
    psubsw m0, m4, m1 ;t4
    paddsw m4, m1 ;t0
    psubsw m3, m5, m7 ;t5
    paddsw m5, m7 ;t1
    ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a
    mova m7, [rsp+gprsize*2+16*3] ;t6a
    psubsw m1, m4, m2 ;t2a
    paddsw m4, m2 ;out0
    mova [rsp+gprsize*2+16*3], m4 ;out0
    mova m4, [rsp+gprsize*2+16*6] ;t7a
    psubsw m2, m3, m7 ;t6
    paddsw m3, m7 ;-out3
    mova [rsp+gprsize*2+16*6], m3 ;-out3
    psubsw m3, m0, m4 ;t7
    paddsw m0, m4 ;out12
    ; slots 10, 11, 12 (and 14 further down) hold intermediates that the
    ; .main_pass1_end / .main_pass2_end steps read back
    mova [rsp+gprsize*2+16*12], m3
    mova m3, [rsp+gprsize*2+16*7] ;t3
    mova [rsp+gprsize*2+16* 7], m2 ;out4
    psubsw m2, m5, m3 ;t3a
    paddsw m5, m3 ;-out15
    mova [rsp+gprsize*2+16*11], m2
    mova m2, [rsp+gprsize*2+32*5] ;t15
    mova [rsp+gprsize*2+16*10], m1 ;-out7
    mova m1, [rsp+gprsize*2+16*0] ;t11
    mova [rsp+gprsize*2+16*0 ], m5 ;-out15
    mova m3, [rsp+gprsize*2+16*1] ;t10
    mova [rsp+gprsize*2+16*1 ], m4 ;-out11
    mova m4, [rsp+gprsize*2+16*2] ;t14
    mova [rsp+gprsize*2+16*2 ], m0 ;out12
    psubsw m0, m3, m4 ;t14a
    paddsw m3, m4 ;t10a
    psubsw m5, m1, m2 ;t15a
    paddsw m1, m2 ;t11a
    ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15
    mova m2, [rsp+gprsize*2+16*4] ;t8
    mova m4, [rsp+gprsize*2+16*5] ;t9
    mova [rsp+gprsize*2+16*4], m3 ;t10a
    mova [rsp+gprsize*2+16*5], m1 ;t11a
    mova m3, [rsp+gprsize*2+16*8] ;t12
    mova m1, [rsp+gprsize*2+16*9] ;t13
; (end of iadst_16x8_internal_8bpc .main, continued from the preceding source
; line; m6 still holds pd_2048 for the ITX_MULSUB_2W below)
    mova [rsp+gprsize*2+16*8], m5 ;t14
    mova [rsp+gprsize*2+16*9], m0 ;t15
    psubsw m5, m2, m3 ;t12a
    paddsw m2, m3 ;t8a
    psubsw m0, m4, m1 ;t13a
    paddsw m4, m1 ;t9a
    ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12
    mova m6, [rsp+gprsize*2+16*4] ;t10a
    mova m1, [rsp+gprsize*2+16*5] ;t11a
    psubsw m3, m2, m6 ;t10
    paddsw m2, m6 ;-out1
    paddsw m6, m4, m1 ;out14
    psubsw m4, m1 ;t11
    mova [rsp+gprsize*2+16*14], m4
    mova [rsp+gprsize*2+16* 4], m2 ;-out1
    mova m4, [rsp+gprsize*2+16*8] ;t14
    mova m2, [rsp+gprsize*2+16*9] ;t15
    mova [rsp+gprsize*2+16* 9], m3 ;out6
    psubsw m3, m0, m4 ;t14a
    paddsw m0, m4 ;out2
    psubsw m4, m5, m2 ;t15a
    paddsw m5, m2 ;-out13
    mova [rsp+gprsize*2+16* 5], m0 ;out2
    ret
; .main_pass1_end: finishing step used after .main in pass 1.
; Each pair of remaining values is interleaved (punpckl/hwd), multiplied with
; pmaddwd by pw_2896_2896 and pw_2896_m2896, rounded with pd_2048, shifted
; right by 12 and packed back to words. m5 and m6 from .main are parked in
; stack slots 14 and 15 first because m5/m6/m7 hold the constants here.
ALIGN function_align
.main_pass1_end:
    mova m0, [rsp+gprsize*2+16*14]
    mova [rsp+gprsize*2+16*14], m5
    mova [rsp+gprsize*2+16*15], m6
    mova m5, [o(pw_2896_2896)]
    mova m6, [o(pw_2896_m2896)]
    mova m7, [o(pd_2048)]
    punpcklwd m2, m3, m4
    punpckhwd m3, m4
    pmaddwd m4, m5, m2
    pmaddwd m2, m6
    pmaddwd m1, m5, m3
    pmaddwd m3, m6
    REPX {paddd x, m7}, m4, m2, m1, m3
    REPX {psrad x, 12}, m4, m1, m2, m3
    packssdw m4, m1 ;-out5
    packssdw m2, m3 ;out10
    mova [rsp+gprsize*2+16* 8], m4
    mova m3, [rsp+gprsize*2+16* 9]
    punpcklwd m1, m3, m0
    punpckhwd m3, m0
    pmaddwd m0, m5, m1
    pmaddwd m1, m6
    pmaddwd m4, m5, m3
    pmaddwd m3, m6
    REPX {paddd x, m7}, m0, m1, m4, m3
    REPX {psrad x, 12}, m0, m4, m1, m3
    packssdw m0, m4 ;out6
    packssdw m1, m3 ;-out9
    mova [rsp+gprsize*2+16* 9], m0
    mova m0, [rsp+gprsize*2+16* 7]
    mova m4, [rsp+gprsize*2+16*12]
    punpcklwd m3, m0, m4
    punpckhwd m0, m4
    pmaddwd m4, m5, m3
    pmaddwd m3, m6
    pmaddwd m5, m0 ; m5 (pw_2896_2896) is consumed here; later uses reload it from memory
    pmaddwd m0, m6
    REPX {paddd x, m7}, m4, m3, m5, m0
    REPX {psrad x, 12}, m4, m5, m3, m0
    packssdw m4, m5 ;out4
; (end of iadst_16x8_internal_8bpc .main_pass1_end, continued from the
; preceding source line; m6 = pw_2896_m2896 and m7 = pd_2048 on entry here)
    packssdw m3, m0 ;-out11
    mova [rsp+gprsize*2+16* 7], m4
    mova m4, [rsp+gprsize*2+16*10]
    mova m5, [rsp+gprsize*2+16*11]
    punpcklwd m0, m4, m5
    punpckhwd m4, m5
    pmaddwd m5, m0, [o(pw_2896_2896)]
    pmaddwd m0, m6
    pmaddwd m6, m4
    pmaddwd m4, [o(pw_2896_2896)]
    REPX {paddd x, m7}, m5, m0, m6, m4
    REPX {psrad x, 12}, m0, m6, m5, m4
    packssdw m0, m6 ;out8
    packssdw m5, m4 ;-out7
    mova [rsp+gprsize*2+16*10], m5
    ; bring back the values parked in slots 2, 14 and 15
    mova m4, [rsp+gprsize*2+16* 2] ;out12
    mova m5, [rsp+gprsize*2+16*14] ;-out13
    mova m6, [rsp+gprsize*2+16*15] ;out14
    ret
; .main_pass2_end: finishing step used after .main in pass 2. It produces
; the same annotated outputs as .main_pass1_end, but with saturating word
; add/sub followed by pmulhrsw with pw_2896x8 instead of 32-bit pmaddwd.
ALIGN function_align
cglobal_label .main_pass2_end
    mova m7, [o(pw_2896x8)]
    mova m1, [rsp+gprsize*2+16* 9]
    mova m2, [rsp+gprsize*2+16*14]
    paddsw m0, m1, m2
    psubsw m1, m2
    pmulhrsw m0, m7 ;out6
    pmulhrsw m1, m7 ;-out9
    mova [rsp+gprsize*2+16* 9], m0
    psubsw m2, m3, m4
    paddsw m3, m4
    pmulhrsw m2, m7 ;out10
    pmulhrsw m3, m7 ;-out5
    mova [rsp+gprsize*2+16* 8], m3
    mova m3, [rsp+gprsize*2+16* 7]
    mova m4, [rsp+gprsize*2+16*12]
    paddsw m0, m3, m4
    psubsw m3, m4
    pmulhrsw m0, m7 ;out4
    pmulhrsw m3, m7 ;-out11
    mova [rsp+gprsize*2+16* 7], m0
    mova m0, [rsp+gprsize*2+16*10]
    paddsw m4, m0, [rsp+gprsize*2+16*11]
    psubsw m0, [rsp+gprsize*2+16*11]
    pmulhrsw m4, m7 ;-out7
    pmulhrsw m0, m7 ;out8
    mova [rsp+gprsize*2+16*10], m4
    mova m4, [rsp+gprsize*2+16*2 ] ;out12
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; iflipadst_16x8_internal_8bpc
; Same coefficient loading and pw_2896x8 pre-scale as iadst_16x8 (rows 0, 1,
; 14, 15 -> stack slots 7..10; rows 8, 9, 6, 7 -> slots 3..6; rows 2..5 and
; 10..13 in m0..m7) and the same .main / .main_pass1_end core; only the
; handling of the two result halves and the 8x8 iflipadst tails differ.
cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova m7, [o(pw_2896x8)]
    pmulhrsw m0, m7, [coeffq+16*0 ]
    pmulhrsw m1, m7, [coeffq+16*1 ]
    pmulhrsw m2, m7, [coeffq+16*14]
    pmulhrsw m3, m7, [coeffq+16*15]
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*9], m2
    mova [rsp+gprsize+32*5], m3
    pmulhrsw m0, m7, [coeffq+16*6 ]
    pmulhrsw m1, m7, [coeffq+16*7 ]
    pmulhrsw m2, m7, [coeffq+16*8 ]
    pmulhrsw m3, m7, [coeffq+16*9 ]
    mova [rsp+gprsize+16*3], m2
    mova [rsp+gprsize+16*4], m3
    mova [rsp+gprsize+16*5], m0
    mova [rsp+gprsize+16*6], m1
    pmulhrsw m0, m7, [coeffq+16*2 ]
    pmulhrsw m1, m7, [coeffq+16*3 ]
    pmulhrsw m2, m7, [coeffq+16*4 ]
    pmulhrsw m3, m7, [coeffq+16*5 ]
    pmulhrsw m4, m7, [coeffq+16*10]
    pmulhrsw m5, m7, [coeffq+16*11]
    pmulhrsw m6, m7, [coeffq+16*12]
    pmulhrsw m7, [coeffq+16*13]

    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    ; Park the register half at coeffq+16*0 (m7 is first refilled from stack
    ; slot 0) and process the stack half first.
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS coeffq+16*0, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end

; Store the first tail result at coeffq+16*1, then run the tail on the half
; parked at coeffq+16*0 with the caller's tx2q restored.
.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS coeffq+16*0, 32
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end

; Pass 2: two runs of the 8x8 iflipadst pass2_main; r3 = dstq+8 is the dst
; for the second run.
.pass2:
    lea tx2q, [o(.end)]
    lea r3, [dstq+8]
    jmp m(iflipadst_8x8_internal_8bpc).pass2_main

.end:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov dstq, r3
    jmp m(iflipadst_8x8_internal_8bpc).pass2_main


INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; iidentity_16x8_internal_8bpc
; coeffq is advanced by 16*16 and rows are addressed with negative offsets.
; .pass1 expects four odd rows in m4..m7 and loads the other four itself
; after moving coeffq back by 8*16; it runs twice (see .pass1_end).
cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    add coeffq, 16*16
    mova m4, [coeffq-16*7]
    mova m5, [coeffq-16*5]
    mova m6, [coeffq-16*3]
    mova m7, [coeffq-16*1]
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]

.pass1:
    mova m0, [o(pw_2896x8)]
    mova m2, [o(pw_1697x16)]
    mova m3, [o(pw_16384)]
    sub coeffq, 8*16
; (continuation of iidentity_16x8_internal_8bpc .pass1 from the preceding
; source line: m0 = pw_2896x8, m2 = pw_1697x16, m3 = pw_16384)
; Each row x is first scaled by pw_2896x8, then replaced by
; x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384) with a saturating add.
; The trailing "; N" annotations give the row index of each result.
    REPX {pmulhrsw x, m0}, m4, m5, m6, m7
    pmulhrsw m1, m2, m4
    pmulhrsw m1, m3
    paddsw m1, m4 ; 1
    pmulhrsw m4, m2, m5
    pmulhrsw m4, m3
    paddsw m4, m5 ; 3
    pmulhrsw m5, m2, m6
    pmulhrsw m5, m3
    paddsw m5, m6 ; 5
    pmulhrsw m6, m2, m7
    pmulhrsw m6, m3
    paddsw m7, m6 ; 7
    pmulhrsw m6, m0, [coeffq+16*6]
    mova [rsp+gprsize+16*0], m4 ; row 3 parked in stack slot 0, reloaded into m3 below
    pmulhrsw m4, m2, m6
    pmulhrsw m4, m3
    paddsw m6, m4 ; 6
    pmulhrsw m4, m0, [coeffq+16*4]
    mova [rsp+gprsize+16*1], m6 ; row 6 parked in stack slot 1
    pmulhrsw m6, m2, m4
    pmulhrsw m6, m3
    paddsw m4, m6 ; 4
    pmulhrsw m6, m0, [coeffq+16*2]
    pmulhrsw m0, [coeffq+16*0]
    pmulhrsw m2, m6
    pmulhrsw m2, m3
    paddsw m2, m6 ; 2
    pmulhrsw m6, m0, [o(pw_1697x16)] ; m2 no longer holds the constant, so it is read from memory
    pmulhrsw m6, m3
    mova m3, [rsp+gprsize+16*0]
    paddsw m0, m6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

; After the first tail run: store m4..m7 at coeffq+16*1/3/5/7, load the next
; four input rows from coeffq-16*7/5/3/1, store m0..m3 in their place, and
; rerun .pass1 with the caller's tx2q restored from r3.
.pass1_end:
    mova [coeffq+16*1], m4
    mova [coeffq+16*3], m5
    mova [coeffq+16*5], m6
    mova [coeffq+16*7], m7
    mova m4, [coeffq-16*7]
    mova m5, [coeffq-16*5]
    mova m6, [coeffq-16*3]
    mova m7, [coeffq-16*1]
    mova [coeffq-16*7], m0
    mova [coeffq-16*5], m1
    mova [coeffq-16*3], m2
    mova [coeffq-16*1], m3
    mov tx2q, r3
    jmp .pass1

; Pass 2: two runs of the 8x8 identity .end; r3 = dstq+8 is the dst for the
; second run, which .end starts on the rows at coeffq+16*1.
.pass2:
    lea tx2q, [o(.end)]
    lea r3, [dstq+8]
    jmp m(iidentity_8x8_internal_8bpc).end

.end:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov dstq, r3
    jmp m(iidentity_8x8_internal_8bpc).end


; INV_TXFM_16X16_FN type1, type2
; Forwards to INV_TXFM_FN for size 16x16. For dct_dct only, it also emits a
; DC-only path: scale the first coefficient by pw_2896x8 once (pmulhrsw),
; load pw_8192 into m2, write eobd to [coeffq], set r2d = 8 and jump to the
; 16x4 .dconly code with tx2q pointing at the RET below.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_8192)]
    mov [coeffq], eobd
    mov r2d, 8
    lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.end:
    RET
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
; Remaining 16x16 instantiations whose first transform type is dct.
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
INV_TXFM_16X16_FN dct, identity

; idct_16x16_internal_8bpc
; Pass 1 handles the coefficients in two groups: the rows at coeffq+16*1 and
; +16*3 (stride 64) first, then, from .pass1_end1, the rows at coeffq+16*0
; and +16*2. Each group goes through the 8x8 idct core, the idct_16x8 .main
; core and two runs of the 8x8 .pass1_end1 tail with m7 = pw_8192. tx2q
; selects where each tail run continues; the caller's tx2q is kept in r3.
cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq+16*1, 64
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*3, 64
    call m(idct_16x8_internal_8bpc).main
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    mova m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+16*17, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.pass1_end1)]
    mova m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS coeffq+16*0, 64
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*2, 64
    call m(idct_16x8_internal_8bpc).main
    lea tx2q, [o(.pass1_end2)]
    mova m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+16*16, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    mova m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

; Pass 2 reuses the idct_8x16 pass-2 code (outside this chunk) for each
; 8-column half.
.pass2:
    lea tx2q, [o(.end)]
    jmp m(idct_8x16_internal_8bpc).pass2_pre

; NOTE(review): r3 is read here but not written in this routine's .pass2;
; presumably the idct_8x16 pass-2 code sets it -- confirm upstream.
.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.end1)]
    mov dstq, r3
    lea r3, [dstq+8]
    jmp m(idct_8x8_internal_8bpc).end

; Zero the first sixteen 16-byte coefficient slots, advance coeffq by 32*8,
; load rows 0, 4, 8, 12 into m0..m3 and rows 1, 5, 9, 13 into m4..m7, and
; rerun the idct_8x16 pass-2 core with dstq = r3 (dst+8).
.end1:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add coeffq, 32*8
    mov dstq, r3

    mova m0, [coeffq+16*0 ]
    mova m1, [coeffq+16*4 ]
    mova m2, [coeffq+16*8 ]
    mova m3, [coeffq+16*12]
    mova m4, [coeffq+16*1 ]
    mova m5, [coeffq+16*5 ]
    mova m6, [coeffq+16*9 ]
    mova m7, [coeffq+16*13]
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
    jmp m(idct_8x16_internal_8bpc).pass2_main


; ITX_16X16_ADST_LOAD_ODD_COEFS
; Load the sixteen odd-numbered 16-byte rows (1, 3, ..., 31) from coeffq
; into the register/stack layout used before the iadst_16x8 .main calls in
; this file: rows 1, 3, 29, 31 -> stack slots 7, 8, 9, 10 (32*5 == 16*10);
; rows 17, 19, 13, 15 -> slots 3, 4, 5, 6; rows 5, 7, 9, 11, 21, 23, 25, 27
; -> m0..m7. No scaling is applied.
%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
    mova m0, [coeffq+16*1 ]
    mova m1, [coeffq+16*3 ]
    mova m2, [coeffq+16*29]
    mova m3, [coeffq+16*31]
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*9], m2
    mova [rsp+gprsize+32*5], m3
    mova m0, [coeffq+16*13]
    mova m1, [coeffq+16*15]
    mova m2, [coeffq+16*17]
    mova m3, [coeffq+16*19]
    mova [rsp+gprsize+16*3], m2
    mova [rsp+gprsize+16*4], m3
    mova [rsp+gprsize+16*5], m0
    mova [rsp+gprsize+16*6], m1
    mova m0, [coeffq+16*5 ]
    mova m1, [coeffq+16*7 ]
    mova m2, [coeffq+16*9 ]
    mova m3, [coeffq+16*11]
    mova m4, [coeffq+16*21]
    mova m5, [coeffq+16*23]
    mova m6, [coeffq+16*25]
    mova m7, [coeffq+16*27]
%endmacro

; ITX_16X16_ADST_LOAD_EVEN_COEFS
; Same layout as the ODD variant, for the even-numbered rows (0, 2, ..., 30):
; rows 0, 2, 28, 30 -> stack slots 7, 8, 9, 10; rows 16, 18, 12, 14 ->
; slots 3, 4, 5, 6; rows 4, 6, 8, 10, 20, 22, 24, 26 -> m0..m7.
%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
    mova m0, [coeffq+16*0 ]
    mova m1, [coeffq+16*2 ]
    mova m2, [coeffq+16*28]
    mova m3, [coeffq+16*30]
    mova [rsp+gprsize+16*7], m0
    mova [rsp+gprsize+16*8], m1
    mova [rsp+gprsize+16*9], m2
    mova [rsp+gprsize+32*5], m3
    mova m0, [coeffq+16*12]
    mova m1, [coeffq+16*14]
    mova m2, [coeffq+16*16]
    mova m3, [coeffq+16*18]
    mova [rsp+gprsize+16*3], m2
    mova [rsp+gprsize+16*4], m3
    mova [rsp+gprsize+16*5], m0
    mova [rsp+gprsize+16*6], m1
    mova m0, [coeffq+16*4 ]
    mova m1, [coeffq+16*6 ]
    mova m2, [coeffq+16*8 ]
    mova m3, [coeffq+16*10]
    mova m4, [coeffq+16*20]
    mova m5, [coeffq+16*22]
    mova m6, [coeffq+16*24]
    mova m7, [coeffq+16*26]
%endmacro

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

; iadst_16x16_internal_8bpc
; Pass 1, first group: odd rows through the shared iadst_16x8 core and its
; pass-1 finishing step. The routine continues on the next source line.
cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ITX_16X16_ADST_LOAD_ODD_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

; (continuation of iadst_16x16_internal_8bpc; its header and the first core
; call are on the preceding source line)
; Each core result goes through the 8x8 iadst .pass1_end1 tail twice with
; m7 = pw_8192: once for the register half and once for the half reloaded
; from stack slots 3... tx2q selects where each tail run continues; the
; caller's tx2q is kept in r3.
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    mova m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+16*17, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.pass1_end1)]
    mova m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

; Second group: even rows through the same core.
.pass1_end1:
    SAVE_8ROWS coeffq+16*1, 32
    ITX_16X16_ADST_LOAD_EVEN_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    lea tx2q, [o(.pass1_end2)]
    mova m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+16*16, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    mova m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

; Pass 2 reuses the iadst_8x16 pass-2 code for each 8-column half.
.pass2:
    lea tx2q, [o(.end)]
    jmp m(iadst_8x16_internal_8bpc).pass2_pre

; Reached after the iadst_8x16 pass-2 code has set r3 to the original dst
; and advanced dstq: restore dstq from r3 and point r3 at dst+8 for the
; second half.
.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.end1)]
    mov dstq, r3
    lea r3, [dstq+8]
    jmp m(iadst_8x8_internal_8bpc).end

; Zero the first sixteen 16-byte coefficient slots, advance coeffq by 32*8,
; and set up the second half by hand in the layout .pass2_pre would produce:
; rows 0, 2 -> stack slots 7, 8; rows 12, 14 -> slots 5, 6; rows 4, 6, 8, 10
; -> m0..m3. Then enter iadst_8x16 .pass2_main with dstq = dst+8.
.end1:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add coeffq, 32*8
    mov dstq, r3

    mova m4, [coeffq+16*0 ]
    mova m5, [coeffq+16*2 ]
    mova m0, [coeffq+16*4 ]
    mova m1, [coeffq+16*6 ]
    mova m2, [coeffq+16*8 ]
    mova m3, [coeffq+16*10]
    mova m6, [coeffq+16*12]
    mova m7, [coeffq+16*14]
    mova [rsp+gprsize+16*7], m4
    mova [rsp+gprsize+16*8], m5
    mova [rsp+gprsize+16*5], m6
    mova [rsp+gprsize+16*6], m7
    lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
    jmp m(iadst_8x16_internal_8bpc).pass2_main


INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; iflipadst_16x16_internal_8bpc -- body continues on the next source line.
cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
; (body of iflipadst_16x16_internal_8bpc; the cglobal header is on the
; preceding source line)
; Pass 1 uses the same odd/even coefficient groups and iadst_16x8 core as
; iadst_16x16, but finishes through the 8x8 iflipadst .pass1_end1 tail with
; m7 = pw_m8192 and stores the halves at different coefficient offsets.
    ITX_16X16_ADST_LOAD_ODD_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]
    mova m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+16*1, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.pass1_end1)]
    mova m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+16*17, 32
    ITX_16X16_ADST_LOAD_EVEN_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    ; For the even group the register half is parked at coeffq+16*0 (m7 is
    ; first refilled from stack slot 0) and the stack half is processed first.
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS coeffq+16*0, 32
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.pass1_end2)]
    mova m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+16*16, 32
    LOAD_8ROWS coeffq+16* 0, 32
    mova [rsp+gprsize+16*0], m7
    mov tx2q, r3
    mova m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

; Pass 2: enters iflipadst_8x16 at .pass2_pre (skipping that routine's own
; r3 setup), so r3 = dstq+8 here marks the second 8-column half.
.pass2:
    lea tx2q, [o(.end)]
    lea r3, [dstq+8]
    jmp m(iflipadst_8x16_internal_8bpc).pass2_pre

; Reload the eight rows in stack slots 3.., advance dstq by 2*stride and run
; the 8x8 iflipadst tail again.
.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.end1)]
    lea dstq, [dstq+strideq*2]
    jmp m(iflipadst_8x8_internal_8bpc).end

; Zero the first sixteen 16-byte coefficient slots, advance coeffq by 32*8
; and set up the second half in the .pass2_pre layout: rows 0, 2 -> stack
; slots 7, 8; rows 12, 14 -> slots 5, 6; rows 4, 6, 8, 10 -> m0..m3.
.end1:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add coeffq, 32*8

    mova m4, [coeffq+16*0 ]
    mova m5, [coeffq+16*2 ]
    mova m0, [coeffq+16*4 ]
    mova m1, [coeffq+16*6 ]
    mova m2, [coeffq+16*8 ]
    mova m3, [coeffq+16*10]
    mova m6, [coeffq+16*12]
    mova m7, [coeffq+16*14]
    mova [rsp+gprsize+16*7], m4
    mova [rsp+gprsize+16*8], m5
    mova [rsp+gprsize+16*5], m6
    mova [rsp+gprsize+16*6], m7

    lea tx2q, [o(.end2)]
; (tail of iflipadst_16x16_internal_8bpc .end1, continued from the preceding
; source line: switch dstq to the second half saved in r3 and rerun the
; iflipadst_8x16 pass-2 core; tx2q was set to .end2 just before)
    mov dstq, r3
    jmp m(iflipadst_8x16_internal_8bpc).pass2_main

.end2:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea dstq, [dstq+strideq*2]
    jmp m(iflipadst_8x8_internal_8bpc).end


; IDTX16B src/dst, tmp, pw_1697x16
; m%1 = pavgw(m%1, pmulhrsw(m%1, m%3) >> 1), using m%2 as scratch.
%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
    pmulhrsw m%2, m%3, m%1
    psraw m%2, 1
    pavgw m%1, m%2
%endmacro

INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity

; iidentity_16x16_internal_8bpc
; Pass 1 runs .pass1 four times, on eight rows (stride 32) each time, with
; coeffq starting at +16*17 and stepped by -16, -15*16 and -16 between runs.
; Each run applies IDTX16B to the rows and goes through idct_8x8
; .pass1_end3; tx2q selects the continuation and the caller's tx2q is kept
; in r3.
cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    add coeffq, 16*17
    mov r3, tx2q
    lea tx2q, [o(.pass1_end)]

; m5 is the IDTX16B scratch for the first six rows, so row 5 is loaded
; afterwards; the last call uses m6 as both temp and constant, which is safe
; because IDTX16B reads m%3 only in its first instruction.
.pass1:
    mova m6, [o(pw_1697x16)]
    mova m7, [coeffq+32*6]
    mova m0, [coeffq+32*0]
    mova m1, [coeffq+32*1]
    mova m2, [coeffq+32*2]
    mova m3, [coeffq+32*3]
    mova m4, [coeffq+32*4]
    REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
    mova m5, [coeffq+32*5]
    mova [rsp+gprsize+16*1], m7
    IDTX16B 5, 7, 6
    mova m7, [coeffq+32*7]
    IDTX16B 7, 6, 6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

.pass1_end:
    SAVE_8ROWS coeffq, 32
    sub coeffq, 16
    lea tx2q, [o(.pass1_end1)]
    jmp .pass1

.pass1_end1:
    SAVE_8ROWS coeffq, 32
    sub coeffq, 15*16
    lea tx2q, [o(.pass1_end2)]
    jmp .pass1

.pass1_end2:
    SAVE_8ROWS coeffq, 32
    sub coeffq, 16
    mov tx2q, r3
    jmp .pass1

.pass2:
    lea r3, [dstq+8]
    lea tx2q, [o(.end1)]

; Apply IDTX16 (macro defined outside this chunk) with pw_1697x16 and then
; pmulhrsw by pw_2048 to all eight rows. m7 and m4 are spilled first because
; they serve as constant and temp. The stack slots are filled on exit for
; idct_8x8 .end3 (which consumes them outside this chunk).
.end:
    mova [rsp+gprsize+16*0], m7
    mova [rsp+gprsize+16*1], m4
    mova m7, [o(pw_1697x16)]
    REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
    mova m4, [o(pw_2048)]
    pmulhrsw m5, m4
    pmulhrsw m6, m4
    mova [rsp+gprsize+16*2], m5
    mova m5, [rsp+gprsize+16*1]
    mova [rsp+gprsize+16*1], m6
    IDTX16 5, 6, 7
    mova m6, [rsp+gprsize+16*0]
    IDTX16 6, 7, 7
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
    pmulhrsw m4, m5
    mova [rsp+gprsize+16*0], m6
    jmp m(idct_8x8_internal_8bpc).end3

; .end1 / .end2 / .end3 chain three more runs of .end: the rows at
; coeffq+16*1 (dstq advanced by 2*stride); then, after zeroing the first
; sixteen slots and advancing coeffq by 32*8, the rows at coeffq with
; dstq = r3 (dst+8); then the rows at coeffq+16*1 again.
.end1:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(.end2)]
    lea dstq, [dstq+strideq*2]
    jmp .end

.end2:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add coeffq, 32*8
    LOAD_8ROWS coeffq, 32
    lea tx2q, [o(.end3)]
    mov dstq, r3
    jmp .end

.end3:
    LOAD_8ROWS coeffq+16*1, 32
    lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea dstq, [dstq+strideq*2]
    jmp .end


; inv_txfm_add_dct_dct_8x32_8bpc
; Public entry: eob == 0 takes the DC-only path, otherwise the full
; transform in idct_8x32_internal_8bpc is called.
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA r5, $$
%endif
    test eobd, eobd
    jz .dconly
    call m(idct_8x32_internal_8bpc)
    RET

; DC-only: scale the first coefficient with pmulhrsw by pw_2896x8, pw_8192,
; pw_2896x8 and pw_2048 in that order (pw_2048 is derived as pw_8192 >> 2),
; broadcast the low word, write eobd (zero here) to [coeffq], and hand off
; to the 8x8 dct_dct .loop with r3d = 8.
.dconly:
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_8192)]
    mov [coeffq], eobd
    pmulhrsw m0, m2
    psrlw m2, 2 ;pw_2048
    pmulhrsw m0, m1
    pmulhrsw m0, m2
    pshuflw m0, m0, q0000
    punpcklwd m0, m0
    mov r3d, 8
    lea tx2q, [o(.end)]
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop

.end:
    RET



; idct_8x32_internal_8bpc
; Pass 1 runs the 8x8 idct core and the .pass1_end1 tail (m7 = pw_8192) on
; groups of eight rows (stride 64) and scatters the results to stack slots;
; the ;inN annotations name what each slot holds for pass 2. When
; eob <= 106 the groups at coeffq+16*3 and +16*2 (in16..in31) are skipped.
cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    cmp eobd, 106
    jle .fast

    LOAD_8ROWS coeffq+16*3, 64
    call m(idct_8x8_internal_8bpc).main
    mova m7, [o(pw_8192)]
    lea tx2q, [o(.pass1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1:
    mova [rsp+gprsize+16*9 ], m0 ;in24
    mova [rsp+gprsize+16*10], m4 ;in28
    mova [rsp+gprsize+16*17], m2 ;in26
    mova [rsp+gprsize+16*18], m6 ;in30
    mova [rsp+gprsize+16*31], m1 ;in25
    mova [rsp+gprsize+16*30], m3 ;in27
    mova [rsp+gprsize+16*27], m5 ;in29
    mova [rsp+gprsize+16*34], m7 ;in31
    LOAD_8ROWS coeffq+16*2, 64
    call m(idct_8x8_internal_8bpc).main
    mova m7, [o(pw_8192)]
    lea tx2q, [o(.pass1_1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_1:
    mova [rsp+gprsize+16*7 ], m0 ;in16
    mova [rsp+gprsize+16*8 ], m4 ;in20
    mova [rsp+gprsize+16*15], m2 ;in18
    mova [rsp+gprsize+16*16], m6 ;in22
    mova [rsp+gprsize+16*33], m1 ;in17
    mova [rsp+gprsize+16*28], m3 ;in19
    mova [rsp+gprsize+16*29], m5 ;in21
    mova [rsp+gprsize+16*32], m7 ;in23

.fast:
    LOAD_8ROWS coeffq+16*1, 64
    call m(idct_8x8_internal_8bpc).main
    mova m7, [o(pw_8192)]
    lea tx2q, [o(.pass1_end)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    mova [rsp+gprsize+16*5 ], m0 ;in8
    mova [rsp+gprsize+16*6 ], m4 ;in12
    mova [rsp+gprsize+16*13], m2 ;in10
    mova [rsp+gprsize+16*14], m6 ;in14
    mova [rsp+gprsize+16*21], m1 ;in9
    mova [rsp+gprsize+16*24], m3 ;in11
    mova [rsp+gprsize+16*25], m5 ;in13
    mova [rsp+gprsize+16*20], m7 ;in15
    LOAD_8ROWS coeffq+16*0, 64
    call m(idct_8x8_internal_8bpc).main
    mova m7, [o(pw_8192)]
    lea tx2q, [o(.pass1_end1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

; Start of pass 2: m0 is left untouched (presumably in0) and in4/in8/in12
; are gathered into m1..m3 for the 8-point core.
.pass1_end1:
    mova [rsp+gprsize+16*11], m2 ;in2
    mova [rsp+gprsize+16*12], m6 ;in6
    mova [rsp+gprsize+16*19], m1 ;in1
    mova [rsp+gprsize+16*26], m3 ;in3
    mova [rsp+gprsize+16*23], m5 ;in5
    mova [rsp+gprsize+16*22], m7 ;in7
    mova m1, m4 ;in4
    mova m2, [rsp+gprsize+16*5 ] ;in8
    mova m3, [rsp+gprsize+16*6 ] ;in12

    cmp eobd, 106
    jg .full

    ; eob <= 106: in16..in31 were never loaded, so m4..m7 are zeroed for the
    ; 8-point and 16-point cores and the reduced .main_fast is used.
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3 , 16
    mova m0, [rsp+gprsize+16*11]
    mova m1, [rsp+gprsize+16*12]
    mova m2, [rsp+gprsize+16*13]
    mova m3, [rsp+gprsize+16*14]
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call .main_fast
    jmp .pass2

.full:
    mova m4, [rsp+gprsize+16*7 ] ;in16
    mova m5, [rsp+gprsize+16*8 ] ;in20
    mova m6, [rsp+gprsize+16*9 ] ;in24
    mova m7, [rsp+gprsize+16*10] ;in28
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3 , 16
    LOAD_8ROWS rsp+gprsize+16*11, 16
    call m(idct_16x8_internal_8bpc).main
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16
    call .main

; r3 holds the final continuation (.end6, a plain ret).
.pass2:
    lea r3, [o(.end6)]

.end:
    mova [rsp+gprsize+16*0 ], m7
    lea tx2q, [o(.end2)]

; Zero the thirty-two 16-byte coefficient slots coeffq+16*0 .. +16*31, then
; continue at the address in tx2q.
.end1:
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
                                   8, 9, 10, 11, 12, 13, 14, 15, \
                                   16, 17, 18, 19, 20, 21, 22, 23, \
                                   24, 25, 26, 27, 28, 29, 30, 31

    jmp tx2q

; .end2 .. .end5: four runs of the 8x8 idct .end tail, one per group of
; eight result rows (registers, then stack slots 11.., 19.., 27..), with
; dstq advanced by 2*stride between runs. The last run continues at the
; address in r3.
.end2:
    lea tx2q, [o(.end3)]
    jmp m(idct_8x8_internal_8bpc).end

.end3:
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova [rsp+gprsize+16*0 ], m7
    lea dstq, [dstq+strideq*2]
    lea tx2q, [o(.end4)]
    jmp m(idct_8x8_internal_8bpc).end

.end4:
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova [rsp+gprsize+16*0 ], m7
    lea dstq, [dstq+strideq*2]
    lea tx2q, [o(.end5)]
    jmp m(idct_8x8_internal_8bpc).end

.end5:
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova [rsp+gprsize+16*0 ], m7
    lea dstq, [dstq+strideq*2]
    mov tx2q, r3
    jmp m(idct_8x8_internal_8bpc).end

.end6:
    ret

; .main_veryfast: variant of the odd half that reads only in1, in7, in5 and
; in3 from the stack. Each first-stage product pair comes from a single
; input via pmulhrsw with a pre-scaled (x8) constant, then it joins the
; common tail at .main2 (outside this chunk). Stack offsets use gprsize*2
; (presumably one extra gprsize for the return address).
ALIGN function_align
cglobal_label .main_veryfast
    mova m0, [rsp+gprsize*2+16*19] ;in1
    pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31
    pmulhrsw m0, [o(pw_201x8)] ;t16,t17
    mova m7, [o(pd_2048)]
    mova [rsp+gprsize*2+16*19], m0 ;t16
    mova [rsp+gprsize*2+16*34], m3 ;t31
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a
    mova [rsp+gprsize*2+16*20], m3 ;t17a
    mova [rsp+gprsize*2+16*33], m0 ;t30a
    mova m1, [rsp+gprsize*2+16*22] ;in7
    pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29
    pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19
    mova [rsp+gprsize*2+16*22], m1 ;t19
    mova [rsp+gprsize*2+16*31], m2 ;t28
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a
    mova [rsp+gprsize*2+16*21], m2 ;t18a
    mova [rsp+gprsize*2+16*32], m1 ;t29a
    mova m0, [rsp+gprsize*2+16*23] ;in5
    pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27
    pmulhrsw m0, [o(pw_995x8)] ;t20, t21
    mova [rsp+gprsize*2+16*23], m0 ;t20
    mova [rsp+gprsize*2+16*30], m3 ;t27
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a
    mova [rsp+gprsize*2+16*24], m3 ;t21a
    mova [rsp+gprsize*2+16*29], m0 ;t26a
    mova m2, [rsp+gprsize*2+16*26] ;in3
    pxor m0, m0 ; m0 and m3 are zero where .main_fast passes the in13 products
    mova m3, m0
    pmulhrsw m1, m2, [o(pw_4052x8)]
    pmulhrsw m2, [o(pw_m601x8)]
    jmp .main2

; .main_fast: variant used when the second half of the inputs is zero (see
; the eob <= 106 path above). Reads in1, in15, in9, in7, in5, in11, in13
; and in3 and replaces the first-stage ITX_MULSUB_2W of .main with
; single-input pmulhrsw products.
ALIGN function_align
cglobal_label .main_fast ;bottom half is zero
    mova m0, [rsp+gprsize*2+16*19] ;in1
    mova m1, [rsp+gprsize*2+16*20] ;in15
    pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a
    pmulhrsw m0, [o(pw_201x8)] ;t16a
    pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a
    pmulhrsw m1, [o(pw_m2751x8)] ;t17a
    mova m7, [o(pd_2048)]
    psubsw m4, m0, m1 ;t17
    paddsw m0, m1 ;t16
    psubsw m5, m3, m2 ;t30
    paddsw m3, m2 ;t31
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
    mova [rsp+gprsize*2+16*19], m0 ;t16
    mova [rsp+gprsize*2+16*20], m5 ;t17a
    mova [rsp+gprsize*2+16*33], m4 ;t30a
    mova [rsp+gprsize*2+16*34], m3 ;t31
    mova m0, [rsp+gprsize*2+16*21] ;in9
    mova m1, [rsp+gprsize*2+16*22] ;in7
    pmulhrsw m3, m0, [o(pw_3703x8)]
    pmulhrsw m0, [o(pw_1751x8)]
    pmulhrsw m2, m1, [o(pw_3857x8)]
    pmulhrsw m1, [o(pw_m1380x8)]
    psubsw m4, m1, m0 ;t18
    paddsw m0, m1 ;t19
    psubsw m5, m2, m3 ;t29
    paddsw m3, m2 ;t28
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
    mova [rsp+gprsize*2+16*21], m5 ;t18a
    mova [rsp+gprsize*2+16*22], m0 ;t19
    mova [rsp+gprsize*2+16*31], m3 ;t28
    mova [rsp+gprsize*2+16*32], m4 ;t29a
    mova m0, [rsp+gprsize*2+16*23] ;in5
    mova m1, [rsp+gprsize*2+16*24] ;in11
    pmulhrsw m3, m0, [o(pw_3973x8)]
    pmulhrsw m0, [o(pw_995x8)]
    pmulhrsw m2, m1, [o(pw_3513x8)]
    pmulhrsw m1, [o(pw_m2106x8)]
    psubsw m4, m0, m1 ;t21
    paddsw m0, m1 ;t20
    psubsw m5, m3, m2 ;t26
    paddsw m3, m2 ;t27
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
    mova [rsp+gprsize*2+16*23], m0 ;t20
    mova [rsp+gprsize*2+16*24], m5 ;t21a
    mova [rsp+gprsize*2+16*29], m4 ;t26a
    mova [rsp+gprsize*2+16*30], m3 ;t27
    mova m0, [rsp+gprsize*2+16*25] ;in13
    mova m2, [rsp+gprsize*2+16*26] ;in3
    pmulhrsw m3, m0, [o(pw_3290x8)]
    pmulhrsw m0, [o(pw_2440x8)]
    pmulhrsw m1, m2, [o(pw_4052x8)]
    pmulhrsw m2, [o(pw_m601x8)]
    jmp .main2

; .main: full variant; the first stage uses ITX_MULSUB_2W on input pairs
; (in1/in31, in17/in15, in9/in23, in25/in7, ...) with m7 = pd_2048.
; NOTE(review): this routine runs past the end of the visible chunk.
ALIGN function_align
cglobal_label .main
    mova m7, [o(pd_2048)]
    mova m0, [rsp+gprsize*2+16*19] ;in1
    mova m1, [rsp+gprsize*2+16*20] ;in15
    mova m2, [rsp+gprsize*2+16*33] ;in17
    mova m3, [rsp+gprsize*2+16*34] ;in31
    ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a
    ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a
    psubsw m4, m0, m2 ;t17
    paddsw m0, m2 ;t16
    psubsw m5, m3, m1 ;t30
    paddsw m3, m1 ;t31
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
    mova [rsp+gprsize*2+16*19], m0 ;t16
    mova [rsp+gprsize*2+16*20], m5 ;t17a
    mova [rsp+gprsize*2+16*33], m4 ;t30a
    mova [rsp+gprsize*2+16*34], m3 ;t31
    mova m0, [rsp+gprsize*2+16*21] ;in9
    mova m1, [rsp+gprsize*2+16*22] ;in7
    mova m2, [rsp+gprsize*2+16*31] ;in25
    mova m3, [rsp+gprsize*2+16*32] ;in23
    ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a
    ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a
    psubsw m4, m2, m0 ;t18
    paddsw m0, m2 ;t19
    psubsw m5, m1, m3 ;t29
    paddsw m3, m1 ;t28
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
    mova [rsp+gprsize*2+16*21], m5 ;t18a
    mova [rsp+gprsize*2+16*22], m0 ;t19
    mova [rsp+gprsize*2+16*31], m3 ;t28
    mova [rsp+gprsize*2+16*32], m4 ;t29a
    mova m0, [rsp+gprsize*2+16*23] ;in5
    ; NOTE(review): the source operand of the next instruction lies on the
    ; following source line, outside this chunk; the fragment is kept as-is.
    mova m1,
[rsp+gprsize*2+16*24] ;in11 3676 mova m2, [rsp+gprsize*2+16*29] ;in21 3677 mova m3, [rsp+gprsize*2+16*30] ;in27 3678 ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a 3679 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a 3680 psubsw m4, m0, m2 ;t21 3681 paddsw m0, m2 ;t20 3682 psubsw m5, m3, m1 ;t26 3683 paddsw m3, m1 ;t27 3684 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a 3685 mova [rsp+gprsize*2+16*23], m0 ;t20 3686 mova [rsp+gprsize*2+16*24], m5 ;t21a 3687 mova [rsp+gprsize*2+16*29], m4 ;t26a 3688 mova [rsp+gprsize*2+16*30], m3 ;t27 3689 mova m0, [rsp+gprsize*2+16*25] ;in13 3690 mova m1, [rsp+gprsize*2+16*26] ;in3 3691 mova m2, [rsp+gprsize*2+16*27] ;in29 3692 mova m3, [rsp+gprsize*2+16*28] ;in19 3693 ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a 3694 ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a 3695 3696.main2: 3697 psubsw m4, m2, m0 ;t22 3698 paddsw m0, m2 ;t23 3699 psubsw m5, m1, m3 ;t25 3700 paddsw m3, m1 ;t24 3701 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a 3702 mova m2, [rsp+gprsize*2+16*24] ;t21a 3703 psubsw m1, m5, m2 ;t21 3704 paddsw m5, m2 ;t22 3705 mova [rsp+gprsize*2+16*25], m5 ;t22 3706 mova m2, [rsp+gprsize*2+16*29] ;t26a 3707 psubsw m5, m4, m2 ;t26 3708 paddsw m4, m2 ;t25 3709 mova [rsp+gprsize*2+16*28], m4 ;t25 3710 ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a 3711 mova [rsp+gprsize*2+16*24], m5 ;t21a 3712 mova [rsp+gprsize*2+16*29], m1 ;t26a 3713 3714 mova m1, [rsp+gprsize*2+16*23] ;t20 3715 mova m5, [rsp+gprsize*2+16*30] ;t27 3716 psubsw m2, m0, m1 ;t20a 3717 paddsw m0, m1 ;t23a 3718 psubsw m6, m3, m5 ;t27a 3719 paddsw m3, m5 ;t24a 3720 ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 3721 mova [rsp+gprsize*2+16*26], m0 ;t23a 3722 mova [rsp+gprsize*2+16*27], m3 ;t24a 3723 mova [rsp+gprsize*2+16*30], m2 ;t27 3724 3725 mova m0, [rsp+gprsize*2+16*20] ;t17a 3726 mova m1, [rsp+gprsize*2+16*21] ;t18a 3727 mova m2, [rsp+gprsize*2+16*32] ;t29a 3728 mova m3, [rsp+gprsize*2+16*33] ;t30a 3729 psubsw m4, m0, m1 ;t18 
3730 paddsw m0, m1 ;t17 3731 psubsw m5, m3, m2 ;t29 3732 paddsw m3, m2 ;t30 3733 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a 3734 mova [rsp+gprsize*2+16*20], m0 ;t17 3735 mova [rsp+gprsize*2+16*21], m5 ;t18a 3736 mova [rsp+gprsize*2+16*32], m4 ;t29a 3737 mova [rsp+gprsize*2+16*33], m3 ;t30 3738 mova m0, [rsp+gprsize*2+16*19] ;t16 3739 mova m1, [rsp+gprsize*2+16*22] ;t19 3740 mova m2, [rsp+gprsize*2+16*31] ;t28 3741 mova m3, [rsp+gprsize*2+16*34] ;t31 3742 psubsw m4, m0, m1 ;t19a 3743 paddsw m0, m1 ;t16a 3744 psubsw m5, m3, m2 ;t28a 3745 paddsw m3, m2 ;t31a 3746 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 3747 mova m2, [rsp+gprsize*2+16*15] ;tmp12 3748 psubsw m1, m5, m6 ;t20a 3749 paddsw m5, m6 ;t19a 3750 psubsw m6, m2, m5 ;out19 3751 paddsw m2, m5 ;out12 3752 mova m5, [rsp+gprsize*2+16*30] ;t27 3753 mova [rsp+gprsize*2+16*22], m6 ;out19 3754 mova [rsp+gprsize*2+16*15], m2 ;out12 3755 psubsw m6, m4, m5 ;t27a 3756 paddsw m4, m5 ;t28a 3757 ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 3758 mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 3759 psubsw m5, m2, m4 ;out28 3760 paddsw m2, m4 ;out3 3761 mova m4, [rsp+gprsize*2+16*14] ;tmp11 3762 mova [rsp+gprsize*2+16*31], m5 ;out28 3763 mova [rsp+gprsize*2+16*6 ], m2 ;out3 3764 psubsw m5, m4, m6 ;out20 3765 paddsw m4, m6 ;out11 3766 mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 3767 mova [rsp+gprsize*2+16*23], m5 ;out20 3768 mova [rsp+gprsize*2+16*14], m4 ;out11 3769 psubsw m5, m2, m1 ;out27 3770 paddsw m2, m1 ;out4 3771 mova m1, [rsp+gprsize*2+16*26] ;t23a 3772 mova m4, [rsp+gprsize*2+16*27] ;t24a 3773 mova [rsp+gprsize*2+16*30], m5 ;out27 3774 mova [rsp+gprsize*2+16*7 ], m2 ;out4 3775 psubsw m5, m0, m1 ;t23 3776 paddsw m0, m1 ;t16 3777 psubsw m2, m3, m4 ;t24 3778 paddsw m3, m4 ;t31 3779 ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a 3780 mova m6, [rsp+gprsize*2+16*18] ;tmp15 3781 psubsw m4, m6, m0 ;out16 3782 paddsw m6, m0 ;out15 3783 mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 3784 mova m1, [rsp+gprsize*2+16*11] ;tmp8 3785 
mova [rsp+gprsize*2+16*18], m6 ;out15 3786 mova [rsp+gprsize*2+16*19], m4 ;out16 3787 psubsw m6, m0, m3 ;out31 3788 paddsw m0, m3 ;out0 3789 psubsw m4, m1, m2 ;out23 3790 paddsw m1, m2 ;out8 3791 mova m3, [rsp+gprsize*2+16*10] ;tmp7 3792 mova [rsp+gprsize*2+16*34], m6 ;out31 3793 mova [rsp+gprsize*2+16*11], m1 ;out8 3794 mova [rsp+gprsize*2+16*26], m4 ;out23 3795 paddsw m6, m3, m5 ;out7 3796 psubsw m3, m5 ;out24 3797 mova m1, [rsp+gprsize*2+16*20] ;t17 3798 mova m5, [rsp+gprsize*2+16*25] ;t22 3799 mova m2, [rsp+gprsize*2+16*17] ;tmp14 3800 mova [rsp+gprsize*2+16*27], m3 ;out24 3801 psubsw m4, m1, m5 ;t22a 3802 paddsw m1, m5 ;t17a 3803 psubsw m3, m2, m1 ;out17 3804 paddsw m2, m1 ;out14 3805 mova m5, [rsp+gprsize*2+16*28] ;t25 3806 mova m1, [rsp+gprsize*2+16*33] ;t30 3807 mova [rsp+gprsize*2+16*17], m2 ;out14 3808 mova [rsp+gprsize*2+16*20], m3 ;out17 3809 psubsw m2, m1, m5 ;t25a 3810 paddsw m1, m5 ;t30a 3811 ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 3812 mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 3813 psubsw m3, m5, m1 ;out30 3814 paddsw m5, m1 ;out1 3815 mova m1, [rsp+gprsize*2+16*12] ;tmp9 3816 mova [rsp+gprsize*2+16*33], m3 ;out30 3817 mova [rsp+gprsize*2+16*4 ], m5 ;out1 3818 psubsw m3, m1, m2 ;out22 3819 paddsw m1, m2 ;out9 3820 mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 3821 mova [rsp+gprsize*2+16*25], m3 ;out22 3822 mova [rsp+gprsize*2+16*12], m1 ;out9 3823 psubsw m3, m5, m4 ;out25 3824 paddsw m5, m4 ;out6 3825 mova m4, [rsp+gprsize*2+16*21] ;t18a 3826 mova m1, [rsp+gprsize*2+16*24] ;t21a 3827 mova m2, [rsp+gprsize*2+16*16] ;tmp13 3828 mova [rsp+gprsize*2+16*28], m3 ;out25 3829 mova [rsp+gprsize*2+16*9 ], m5 ;out6 3830 paddsw m3, m4, m1 ;t18 3831 psubsw m4, m1 ;t21 3832 psubsw m5, m2, m3 ;out18 3833 paddsw m2, m3 ;out13 3834 mova m1, [rsp+gprsize*2+16*29] ;t26a 3835 mova m3, [rsp+gprsize*2+16*32] ;t29a 3836 mova [rsp+gprsize*2+16*21], m5 ;out18 3837 mova [rsp+gprsize*2+16*16], m2 ;out13 3838 psubsw m5, m3, m1 ;t26 3839 paddsw m3, m1 ;t29 3840 ITX_MULSUB_2W 5, 4, 
1, 2, 7, 2896, 2896 ;t21a, t26a 3841 mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 3842 psubsw m1, m2, m3 ;out29 3843 paddsw m2, m3 ;out2 3844 mova m3, [rsp+gprsize*2+16*13] ;tmp10 3845 mova [rsp+gprsize*2+16*32], m1 ;out29 3846 psubsw m7, m3, m5 ;out21 3847 paddsw m3, m5 ;out10 3848 mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 3849 mova [rsp+gprsize*2+16*24], m7 ;out21 3850 mova [rsp+gprsize*2+16*13], m3 ;out10 3851 psubsw m1, m5, m4 ;out26 3852 paddsw m5, m4 ;out5 3853 mova m7, m6 ;out7 3854 mova m3, [rsp+gprsize*2+16*6 ] ;out3 3855 mova m4, [rsp+gprsize*2+16*7 ] ;out4 3856 mova [rsp+gprsize*2+16*29], m1 ;out26 3857 mova m6, [rsp+gprsize*2+16*9 ] ;out6 3858 mova m1, [rsp+gprsize*2+16*4 ] ;out1 3859 ret 3860 3861 3862cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 3863%if ARCH_X86_32 3864 LEA r5, $$ 3865%endif 3866 test eobd, eobd 3867 jz .dconly 3868 call m(idct_32x8_internal_8bpc) 3869 RET 3870 3871.dconly: 3872 movd m1, [o(pw_2896x8)] 3873 pmulhrsw m0, m1, [coeffq] 3874 movd m2, [o(pw_8192)] 3875 mov [coeffq], eobd 3876 mov r3d, 8 3877 lea tx2q, [o(.end)] 3878 3879.body: 3880 pmulhrsw m0, m2 3881 movd m2, [o(pw_2048)] ;intentionally rip-relative 3882 pmulhrsw m0, m1 3883 pmulhrsw m0, m2 3884 pshuflw m0, m0, q0000 3885 punpcklwd m0, m0 3886 pxor m5, m5 3887 3888.loop: 3889 mova m1, [dstq+16*0] 3890 mova m3, [dstq+16*1] 3891 punpckhbw m2, m1, m5 3892 punpcklbw m1, m5 3893 punpckhbw m4, m3, m5 3894 punpcklbw m3, m5 3895 paddw m2, m0 3896 paddw m1, m0 3897 paddw m4, m0 3898 paddw m3, m0 3899 packuswb m1, m2 3900 packuswb m3, m4 3901 mova [dstq+16*0], m1 3902 mova [dstq+16*1], m3 3903 add dstq, strideq 3904 dec r3d 3905 jg .loop 3906 jmp tx2q 3907 3908.end: 3909 RET 3910 3911 3912cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 3913 LOAD_8ROWS coeffq+16*0, 64 3914 call m(idct_8x8_internal_8bpc).main 3915 SAVE_7ROWS rsp+gprsize+16*3, 16 3916 3917 LOAD_8ROWS coeffq+16*2, 64 3918 call m(idct_16x8_internal_8bpc).main 3919 mova 
m7, [rsp+gprsize+16*0] 3920 SAVE_8ROWS rsp+gprsize+16*11, 16 3921 3922 LOAD_8ROWS coeffq+16*1, 32 3923 mova [rsp+gprsize+16*19], m0 ;in1 3924 mova [rsp+gprsize+16*26], m1 ;in3 3925 mova [rsp+gprsize+16*23], m2 ;in5 3926 mova [rsp+gprsize+16*22], m3 ;in7 3927 mova [rsp+gprsize+16*21], m4 ;in9 3928 mova [rsp+gprsize+16*24], m5 ;in11 3929 mova [rsp+gprsize+16*25], m6 ;in13 3930 mova [rsp+gprsize+16*20], m7 ;in15 3931 3932 cmp eobd, 106 3933 jg .full 3934 call m(idct_8x32_internal_8bpc).main_fast 3935 jmp .pass2 3936 3937.full: 3938 LOAD_8ROWS coeffq+16*17, 32 3939 mova [rsp+gprsize+16*33], m0 ;in17 3940 mova [rsp+gprsize+16*28], m1 ;in19 3941 mova [rsp+gprsize+16*29], m2 ;in21 3942 mova [rsp+gprsize+16*32], m3 ;in23 3943 mova [rsp+gprsize+16*31], m4 ;in25 3944 mova [rsp+gprsize+16*30], m5 ;in27 3945 mova [rsp+gprsize+16*27], m6 ;in29 3946 mova [rsp+gprsize+16*34], m7 ;in31 3947 call m(idct_8x32_internal_8bpc).main 3948 3949.pass2: 3950 mova [rsp+gprsize+16*0 ], m7 3951 lea tx2q, [o(.end)] 3952 jmp m(idct_8x32_internal_8bpc).end1 3953 3954.end: 3955 mova m7, [o(pw_8192)] 3956 lea tx2q, [o(.end1)] 3957 jmp m(idct_8x8_internal_8bpc).pass1_end1 3958 3959.end1: 3960 lea r3, [dstq+8] 3961 lea tx2q, [o(.end2)] 3962 jmp m(idct_8x8_internal_8bpc).pass2_main 3963 3964.end2: 3965 LOAD_8ROWS rsp+gprsize+16*11, 16 3966 mova [rsp+gprsize+16*0 ], m7 3967 mova m7, [o(pw_8192)] 3968 lea tx2q, [o(.end3)] 3969 jmp m(idct_8x8_internal_8bpc).pass1_end1 3970 3971.end3: 3972 mov dstq, r3 3973 add r3, 8 3974 lea tx2q, [o(.end4)] 3975 jmp m(idct_8x8_internal_8bpc).pass2_main 3976 3977.end4: 3978 LOAD_8ROWS rsp+gprsize+16*19, 16 3979 mova [rsp+gprsize+16*0 ], m7 3980 mova m7, [o(pw_8192)] 3981 lea tx2q, [o(.end5)] 3982 jmp m(idct_8x8_internal_8bpc).pass1_end1 3983 3984.end5: 3985 mov dstq, r3 3986 add r3, 8 3987 lea tx2q, [o(.end6)] 3988 jmp m(idct_8x8_internal_8bpc).pass2_main 3989 3990.end6: 3991 LOAD_8ROWS rsp+gprsize+16*27, 16 3992 mova [rsp+gprsize+16*0 ], m7 3993 mova m7, [o(pw_8192)] 
; ---- idct_32x8_internal_8bpc, continued: remainder of its .end6 step ----
    lea       tx2q, [o(.end7)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.end7:
    mov       dstq, r3                  ; r3 was stepped in 8-byte increments by .end1/.end3/.end5
    lea       tx2q, [o(.end8)]
    jmp       m(idct_8x8_internal_8bpc).pass2_main

.end8:
    ret                                 ; bare return; the identity functions below also use it as a tx2q target


;-----------------------------------------------------------------------
; inv_txfm_add_identity_identity_8x32_8bpc(dst, stride, coeff, eob)
;
; Loop count (r3d): 2, or 4 when eob >= 107.
; Each iteration:
;   - loads eight coefficient rows spaced 64 bytes apart,
;   - adds pw_5 to every row and, after the pass1_end3 helper, shifts
;     every row right arithmetically by 3,
;   - spills m5/m6/m7 to the stack and calls the shared 8x8 .end3 helper,
;   - zeroes the eight rows it consumed and steps coeffq by 16 bytes.
; tx2q points at a bare `ret` for the whole loop.
; NOTE(review): the helpers called here are defined outside this chunk;
; they presumably finish with `jmp tx2q`, which is why tx2q must hold a
; valid return stub - confirm against idct_8x8_internal_8bpc.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov       r5d, 4
    mov       tx2d, 2
    cmp       eobd, 107
    cmovns    tx2d, r5d                 ; eob >= 107 -> 4 iterations instead of 2
    mov       r3d, tx2d                 ; r3d = loop counter (tx2 is reused as a pointer below)
%if ARCH_X86_32
    LEA       r5, $$                    ; base register for o() addressing on 32-bit builds
%endif
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
.loop:
    LOAD_8ROWS coeffq+16*0, 64
    paddsw    m6, [o(pw_5)]
    mova      [rsp+16*1], m6            ; m6 is parked so the register can hold the constant
    mova      m6, [o(pw_5)]
    REPX      {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    call      m(idct_8x8_internal_8bpc).pass1_end3
    REPX      {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    mova      [rsp+16*2], m5
    mova      [rsp+16*1], m6
    mova      [rsp+16*0], m7
    call      m(idct_8x8_internal_8bpc).end3
    lea       dstq, [dstq+strideq*2]
    pxor      m7, m7
    REPX      {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear the rows just consumed
    add       coeffq, 16
    dec       r3d
    jg        .loop
    RET

;-----------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x8_8bpc(dst, stride, coeff, eob)
;
; Loop count (r3d): 2, or 4 when eob >= 107.
; Each iteration:
;   - loads eight coefficient rows spaced 16 bytes apart,
;   - multiplies every row by pw_4096 with pmulhrsw (rounded x/8),
;   - saves dstq at [rsp+16*3], calls the shared 8x8 .end3 helper with
;     tx2q = idct_8x8_internal_8bpc.end4, then restores dstq and moves
;     it 8 bytes to the right,
;   - steps coeffq by 16*8 bytes.
; NOTE(review): .end4 is outside this chunk; since this loop never
; zeroes coefficients itself, .end4 presumably does - confirm.
;
; Fix: the loop used to end with `jg .loop / jnc .loop`. DEC leaves CF
; untouched, and neither MOV nor LEA writes flags, so CF at that point
; was whatever `add coeffq, 16*8` produced - clear for any pointer that
; does not wrap. The extra branch was therefore always taken once the
; counter reached zero and the loop never terminated. The loop now
; exits on the counter alone, like the 8x32 variant above.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov       r5d, 4
    mov       tx2d, 2
    cmp       eobd, 107
    cmovns    tx2d, r5d                 ; eob >= 107 -> 4 iterations instead of 2
    mov       r3d, tx2d                 ; r3d = loop counter
%if ARCH_X86_32
    LEA       r5, $$                    ; base register for o() addressing on 32-bit builds
%endif

.loop:
    LOAD_8ROWS coeffq+16*0, 16
    pmulhrsw  m6, [o(pw_4096)]
    mova      [rsp+16*1], m6            ; m6 is parked so the register can hold the constant
    mova      m6, [o(pw_4096)]
    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
    call      m(idct_8x8_internal_8bpc).pass1_end3

    mov       [rsp+16*3], dstq          ; keep the column origin across the .end3 call
    mova      [rsp+16*2], m5
    mova      [rsp+16*1], m6
    mova      [rsp+16*0], m7
    lea       tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
    call      m(idct_8x8_internal_8bpc).end3

    add       coeffq, 16*8
    mov       dstq, [rsp+16*3]
    lea       dstq, [dstq+8]            ; next 8-byte column of the destination
    dec       r3d
    jg        .loop                     ; sole loop exit condition: counter exhausted
    RET


;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x32_8bpc(dst, stride, coeff, eob)
;
; eob != 0: runs idct_16x32_internal_8bpc.
; eob == 0: DC-only path. The DC coefficient is scaled twice by
;           pw_2896x8, the coefficient word is overwritten with eobd
;           (zero on this path), and control passes to the 16x4
;           .dconly code with r2d = 16 and tx2q = .end.
; NOTE(review): r2 is presumably the register behind coeffq (third
; named argument); it is only overwritten after the last coeffq access.
; The meaning of r2d/m2 for the 16x4 .dconly code is not visible here.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly
    call      m(idct_16x32_internal_8bpc)
.end:
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_16384)]
    mov       [coeffq], eobd            ; eobd is zero here: clears the DC coefficient
    pmulhrsw  m0, m1
    mov       r2d, 16
    lea       tx2q, [o(.end)]
    jmp       m(inv_txfm_add_dct_dct_16x4_8bpc).dconly


;-----------------------------------------------------------------------
; idct_16x32_internal_8bpc
;
; First stage (shown up to the in5 store; the routine continues past
; this point). Two column groups are transformed with the shared 8x8
; and 16x8 .main routines. Results are written back into the
; coefficient buffer (offsets 16*32 / 16*33, stride 64) and scattered
; into stack slots whose "inN" tags name the second-stage input each
; slot feeds.
; LOAD_8ROWS / SAVE_7ROWS / SAVE_8ROWS are file macros defined outside
; this chunk.
;-----------------------------------------------------------------------
cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS coeffq+16*1, 128, 1
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*5, 128, 1
    call      m(idct_16x8_internal_8bpc).main
    lea       tx2q, [o(.pass1_end)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS coeffq+16*33, 64         ;in8~in15
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(.pass1_end1)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    mova      [coeffq+16*1 ], m0        ;in8
    mova      [coeffq+16*5 ], m4        ;in12
    mova      [rsp+gprsize+16*13], m2   ;in10
    mova      [rsp+gprsize+16*14], m6   ;in14
    mova      [rsp+gprsize+16*21], m1   ;in9
    mova      [rsp+gprsize+16*24], m3   ;in11
    mova      [rsp+gprsize+16*25], m5   ;in13
    mova      [rsp+gprsize+16*20], m7   ;in15
    LOAD_8ROWS coeffq+16*0, 128, 1
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*4, 128, 1
    call      m(idct_16x8_internal_8bpc).main
    lea       tx2q, [o(.pass1_end2)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    SAVE_8ROWS coeffq+16*32, 64         ;in0~in7
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(.pass1_end3)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    mova      [rsp+gprsize+16*11], m2   ;in2
    mova      [rsp+gprsize+16*12], m6   ;in6
    mova      [rsp+gprsize+16*19], m1   ;in1
    mova      [rsp+gprsize+16*26], m3   ;in3
    mova      [rsp+gprsize+16*23], m5   ;in5
mova [rsp+gprsize+16*22], m7 ;in7 4139 4140 cmp eobd, 150 4141 jg .full 4142 4143 mova m1, m4 ;in4 4144 mova m2, [coeffq+16*1 ] ;in8 4145 mova m3, [coeffq+16*5 ] ;in12 4146 pxor m4, m4 4147 REPX {mova x, m4}, m5, m6, m7 4148 call m(idct_8x8_internal_8bpc).main 4149 SAVE_7ROWS rsp+gprsize+16*3, 16 4150 mova m0, [rsp+gprsize+16*11] ;in2 4151 mova m1, [rsp+gprsize+16*12] ;in6 4152 mova m2, [rsp+gprsize+16*13] ;in10 4153 mova m3, [rsp+gprsize+16*14] ;in14 4154 pxor m4, m4 4155 REPX {mova x, m4}, m5, m6, m7 4156 call m(idct_16x8_internal_8bpc).main 4157 mova m7, [rsp+gprsize+16*0] 4158 SAVE_8ROWS rsp+gprsize+16*11, 16 4159 4160 call m(idct_8x32_internal_8bpc).main_fast 4161 jmp .pass2 4162 4163.full: 4164 mova [coeffq+16*0 ], m0 ;in0 4165 mova [coeffq+16*4 ], m4 ;in4 4166 4167 LOAD_8ROWS coeffq+16*2, 128, 1 4168 call m(idct_8x8_internal_8bpc).main 4169 SAVE_7ROWS rsp+gprsize+16*3, 16 4170 LOAD_8ROWS coeffq+16*6, 128, 1 4171 call m(idct_16x8_internal_8bpc).main 4172 lea tx2q, [o(.pass1_end4)] 4173 jmp m(idct_8x8_internal_8bpc).pass1_end 4174 4175.pass1_end4: 4176 SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 4177 LOAD_8ROWS rsp+gprsize+16*3, 16 4178 mova [rsp+gprsize+16*0], m7 4179 lea tx2q, [o(.pass1_end5)] 4180 jmp m(idct_8x8_internal_8bpc).pass1_end 4181 4182.pass1_end5: 4183 mova [coeffq+16*2 ], m0 ;in16 4184 mova [coeffq+16*6 ], m4 ;in20 4185 mova [rsp+gprsize+16*15], m2 ;in18 4186 mova [rsp+gprsize+16*16], m6 ;in22 4187 mova [rsp+gprsize+16*33], m1 ;in17 4188 mova [rsp+gprsize+16*28], m3 ;in19 4189 mova [rsp+gprsize+16*29], m5 ;in21 4190 mova [rsp+gprsize+16*32], m7 ;in23 4191 4192 LOAD_8ROWS coeffq+16*3, 128, 1 4193 call m(idct_8x8_internal_8bpc).main 4194 SAVE_7ROWS rsp+gprsize+16*3, 16 4195 LOAD_8ROWS coeffq+16*7, 128, 1 4196 call m(idct_16x8_internal_8bpc).main 4197 lea tx2q, [o(.pass1_end6)] 4198 jmp m(idct_8x8_internal_8bpc).pass1_end 4199 4200.pass1_end6: 4201 SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 4202 LOAD_8ROWS rsp+gprsize+16*3, 16 4203 mova [rsp+gprsize+16*0], 
m7 4204 lea tx2q, [o(.pass1_end7)] 4205 jmp m(idct_8x8_internal_8bpc).pass1_end 4206 4207.pass1_end7: 4208 mova [rsp+gprsize+16*17], m2 ;in26 4209 mova [rsp+gprsize+16*18], m6 ;in30 4210 mova [rsp+gprsize+16*31], m1 ;in25 4211 mova [rsp+gprsize+16*30], m3 ;in27 4212 mova [rsp+gprsize+16*27], m5 ;in29 4213 mova [rsp+gprsize+16*34], m7 ;in31 4214 4215 mova m6, m0 ;in24 4216 mova m7, m4 ;in28 4217 mova m0, [coeffq+16*0 ] ;in0 4218 mova m1, [coeffq+16*4 ] ;in4 4219 mova m2, [coeffq+16*1 ] ;in8 4220 mova m3, [coeffq+16*5 ] ;in12 4221 mova m4, [coeffq+16*2 ] ;in16 4222 mova m5, [coeffq+16*6 ] ;in20 4223 call m(idct_8x8_internal_8bpc).main 4224 SAVE_7ROWS rsp+gprsize+16*3 , 16 4225 LOAD_8ROWS rsp+gprsize+16*11, 16 4226 call m(idct_16x8_internal_8bpc).main 4227 mova m7, [rsp+gprsize+16*0] 4228 SAVE_8ROWS rsp+gprsize+16*11, 16 4229 4230 call m(idct_8x32_internal_8bpc).main 4231 4232.pass2: 4233 mov [rsp+gprsize*1+16*35], eobd 4234 lea r3, [dstq+8] 4235 mov [rsp+gprsize*2+16*35], r3 4236 lea r3, [o(.end)] 4237 jmp m(idct_8x32_internal_8bpc).end 4238 4239.end: 4240 mov dstq, [rsp+gprsize*2+16*35] 4241 mov eobd, [rsp+gprsize*1+16*35] 4242 add coeffq, 16*32 4243 4244 mova m0, [coeffq+16*4 ] ;in1 4245 mova m1, [coeffq+16*12] ;in3 4246 mova m2, [coeffq+16*20] ;in5 4247 mova m3, [coeffq+16*28] ;in7 4248 mova m4, [coeffq+16*5 ] ;in9 4249 mova m5, [coeffq+16*13] ;in11 4250 mova m6, [coeffq+16*21] ;in13 4251 mova m7, [coeffq+16*29] ;in15 4252 4253 mova [rsp+gprsize+16*19], m0 ;in1 4254 mova [rsp+gprsize+16*26], m1 ;in3 4255 mova [rsp+gprsize+16*23], m2 ;in5 4256 mova [rsp+gprsize+16*22], m3 ;in7 4257 mova [rsp+gprsize+16*21], m4 ;in9 4258 mova [rsp+gprsize+16*24], m5 ;in11 4259 mova [rsp+gprsize+16*25], m6 ;in13 4260 mova [rsp+gprsize+16*20], m7 ;in15 4261 4262 mova m0, [coeffq+16*0 ] ;in0 4263 mova m1, [coeffq+16*16] ;in4 4264 mova m2, [coeffq+16*1 ] ;in8 4265 mova m3, [coeffq+16*17] ;in12 4266 4267 cmp eobd, 150 4268 jg .full1 4269 4270 pxor m4, m4 4271 REPX {mova x, m4}, m5, m6, 
m7 4272 call m(idct_8x8_internal_8bpc).main 4273 SAVE_7ROWS rsp+gprsize+16*3, 16 4274 4275 mova m0, [coeffq+16*8 ] ;in2 4276 mova m1, [coeffq+16*24] ;in6 4277 mova m2, [coeffq+16*9 ] ;in10 4278 mova m3, [coeffq+16*25] ;in14 4279 pxor m4, m4 4280 REPX {mova x, m4}, m5, m6, m7 4281 call m(idct_16x8_internal_8bpc).main 4282 mova m7, [rsp+gprsize+16*0] 4283 SAVE_8ROWS rsp+gprsize+16*11, 16 4284 4285 call m(idct_8x32_internal_8bpc).main_fast 4286 jmp m(idct_8x32_internal_8bpc).pass2 4287 4288.full1: 4289 mova m4, [coeffq+16*2 ] ;in16 4290 mova m5, [coeffq+16*18] ;in20 4291 mova m6, [coeffq+16*3 ] ;in24 4292 mova m7, [coeffq+16*19] ;in28 4293 call m(idct_8x8_internal_8bpc).main 4294 SAVE_7ROWS rsp+gprsize+16*3, 16 4295 4296 mova m0, [coeffq+16*8 ] ;in2 4297 mova m1, [coeffq+16*24] ;in6 4298 mova m2, [coeffq+16*9 ] ;in10 4299 mova m3, [coeffq+16*25] ;in14 4300 mova m4, [coeffq+16*10] ;in18 4301 mova m5, [coeffq+16*26] ;in22 4302 mova m6, [coeffq+16*11] ;in26 4303 mova m7, [coeffq+16*27] ;in30 4304 call m(idct_16x8_internal_8bpc).main 4305 mova m7, [rsp+gprsize+16*0] 4306 SAVE_8ROWS rsp+gprsize+16*11, 16 4307 4308 mova m0, [coeffq+16*6 ] ;in17 4309 mova m1, [coeffq+16*14] ;in19 4310 mova m2, [coeffq+16*22] ;in21 4311 mova m3, [coeffq+16*30] ;in23 4312 mova m4, [coeffq+16*7 ] ;in25 4313 mova m5, [coeffq+16*15] ;in27 4314 mova m6, [coeffq+16*23] ;in29 4315 mova m7, [coeffq+16*31] ;in31 4316 4317 mova [rsp+gprsize+16*33], m0 ;in17 4318 mova [rsp+gprsize+16*28], m1 ;in19 4319 mova [rsp+gprsize+16*29], m2 ;in21 4320 mova [rsp+gprsize+16*32], m3 ;in23 4321 mova [rsp+gprsize+16*31], m4 ;in25 4322 mova [rsp+gprsize+16*30], m5 ;in27 4323 mova [rsp+gprsize+16*27], m6 ;in29 4324 mova [rsp+gprsize+16*34], m7 ;in31 4325 4326 call m(idct_8x32_internal_8bpc).main 4327 jmp m(idct_8x32_internal_8bpc).pass2 4328 4329 4330cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 4331%if ARCH_X86_32 4332 LEA r5, $$ 4333%endif 4334 test eobd, eobd 4335 jz .dconly
4336 4337 call m(idct_32x16_internal_8bpc) 4338 call m(idct_8x16_internal_8bpc).pass2 4339 4340 add coeffq, 16*16 4341 lea dstq, [r3+8] 4342 LOAD_8ROWS rsp+16*11, 16 4343 mova [rsp+16*0], m7 4344 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] 4345 call m(idct_8x8_internal_8bpc).pass1_end 4346 call m(idct_8x16_internal_8bpc).pass2 4347 4348 add coeffq, 16*16 4349 lea dstq, [r3+8] 4350 LOAD_8ROWS rsp+16*19, 16 4351 mova [rsp+16*0], m7 4352 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] 4353 call m(idct_8x8_internal_8bpc).pass1_end 4354 call m(idct_8x16_internal_8bpc).pass2 4355 4356 add coeffq, 16*16 4357 lea dstq, [r3+8] 4358 LOAD_8ROWS rsp+16*27, 16 4359 mova [rsp+16*0], m7 4360 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] 4361 call m(idct_8x8_internal_8bpc).pass1_end 4362 call m(idct_8x16_internal_8bpc).pass2 4363 RET 4364 4365.dconly: 4366 movd m1, [o(pw_2896x8)] 4367 pmulhrsw m0, m1, [coeffq] 4368 movd m2, [o(pw_16384)] 4369 mov [coeffq], eobd 4370 pmulhrsw m0, m1 4371 mov r3d, 16 4372 lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] 4373 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body 4374 4375 4376cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 4377 add coeffq, 16 4378 lea r3, [o(.pass1_end1)] 4379.pass1: 4380 LOAD_8ROWS coeffq+16*0, 128, 1 4381 call m(idct_8x8_internal_8bpc).main 4382 SAVE_7ROWS rsp+gprsize+16*3, 16 4383 4384 LOAD_8ROWS coeffq+16*4, 128, 1 4385 call m(idct_16x8_internal_8bpc).main 4386 mova m7, [rsp+gprsize+16*0] 4387 SAVE_8ROWS rsp+gprsize+16*11, 16 4388 4389 LOAD_8ROWS coeffq+16*2, 64, 1 4390 mova [rsp+gprsize+16*19], m0 ;in1 4391 mova [rsp+gprsize+16*26], m1 ;in3 4392 mova [rsp+gprsize+16*23], m2 ;in5 4393 mova [rsp+gprsize+16*22], m3 ;in7 4394 mova [rsp+gprsize+16*21], m4 ;in9 4395 mova [rsp+gprsize+16*24], m5 ;in11 4396 mova [rsp+gprsize+16*25], m6 ;in13 4397 mova [rsp+gprsize+16*20], m7 ;in15 4398 4399 LOAD_8ROWS coeffq+16*34, 64, 1 4400 mova [rsp+gprsize+16*33], m0 ;in17 4401 mova [rsp+gprsize+16*28], m1 ;in19 
4402 mova [rsp+gprsize+16*29], m2 ;in21 4403 mova [rsp+gprsize+16*32], m3 ;in23 4404 mova [rsp+gprsize+16*31], m4 ;in25 4405 mova [rsp+gprsize+16*30], m5 ;in27 4406 mova [rsp+gprsize+16*27], m6 ;in29 4407 mova [rsp+gprsize+16*34], m7 ;in31 4408 call m(idct_8x32_internal_8bpc).main 4409 4410.pass1_end: 4411 mova [rsp+gprsize+16*0 ], m7 4412 mov tx2q, r3 4413 jmp m(idct_8x8_internal_8bpc).pass1_end 4414 4415.pass1_end1: 4416 SAVE_8ROWS coeffq+16*0, 32 4417 LOAD_8ROWS rsp+gprsize+16*11, 16 4418 mova [rsp+gprsize+16*0 ], m7 4419 lea tx2q, [o(.pass1_end2)] 4420 jmp m(idct_8x8_internal_8bpc).pass1_end 4421 4422.pass1_end2: 4423 SAVE_8ROWS coeffq+16*16, 32 4424 LOAD_8ROWS rsp+gprsize+16*19, 16 4425 mova [rsp+gprsize+16*0 ], m7 4426 lea tx2q, [o(.pass1_end3)] 4427 jmp m(idct_8x8_internal_8bpc).pass1_end 4428 4429.pass1_end3: 4430 SAVE_8ROWS coeffq+16*32, 32 4431 LOAD_8ROWS rsp+gprsize+16*27, 16 4432 mova [rsp+gprsize+16*0 ], m7 4433 lea tx2q, [o(.pass1_end4)] 4434 jmp m(idct_8x8_internal_8bpc).pass1_end 4435 4436.pass1_end4: 4437 SAVE_8ROWS coeffq+16*48, 32 4438 4439 sub coeffq, 16 4440 lea r3, [o(.end)] 4441 jmp .pass1 4442 4443.end: 4444 ret 4445 4446 4447cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 4448 mov r4d, eobd 4449 cmp eobd, 43 ;if (eob > 43) 4450 sbb r3d, r3d ; iteration_count++ 4451 cmp r4d, 150 ;if (eob > 150) 4452 sbb r3d, 0 ; iteration_count++ 4453 cmp r4d, 278 ;if (eob > 278) 4454 sbb r3d, -4 ; iteration_count++ 4455 4456%if ARCH_X86_32 4457 LEA r5, $$ 4458%endif 4459 lea r4, [dstq+8] 4460 mov [rsp+16*3], r4 4461 mov [rsp+gprsize+16*3], r3d 4462 mov [rsp+gprsize*2+16*3], coeffq 4463 4464.loop: 4465 LOAD_8ROWS coeffq, 64, 1 4466 mova [rsp+16*1], m6 4467 pxor m6, m6 4468 REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 4469 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] 4470 call m(idct_8x8_internal_8bpc).pass1_end3 4471 mova [rsp+16*0], m2 4472 mova [rsp+16*1], m3 4473 mova [rsp+16*2], m4 4474 mova m3, 
[o(pw_1697x16)] 4475 mova m4, [o(pw_16384)] 4476 REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 4477 mova m2, [o(pw_8192)] 4478 REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 4479 mova m2, [rsp+16*0] 4480 mova [rsp+16*0], m7 4481 IDTX16 2, 7, 3, 4 4482 mova m7, [rsp+16*2] 4483 mova [rsp+16*2], m5 4484 IDTX16 7, 5, 3, 4 4485 mova m5, [rsp+16*1] 4486 mova [rsp+16*1], m6 4487 pmulhrsw m3, m5 4488 pmulhrsw m3, m4 4489 psrlw m4, 1 ; pw_8192 4490 paddsw m3, m5 4491 pmulhrsw m2, m4 4492 pmulhrsw m3, m4 4493 pmulhrsw m4, m7 4494 call m(idct_8x8_internal_8bpc).end3 4495 lea dstq, [dstq+strideq*2] 4496 add coeffq, 16 4497 dec r3d 4498 jg .loop 4499 mov coeffq, [rsp+gprsize*2+16*3] 4500 add coeffq, 64*8 4501 mov r3d, [rsp+gprsize+16*3] 4502 xor dstq, dstq 4503 mov [rsp+gprsize+16*3], dstq 4504 mov dstq, [rsp+16*3] 4505 test r3d, r3d 4506 jnz .loop 4507 RET 4508 4509 4510cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 4511 mov r4d, 12 ;1100b 4512 mov r5d, 136 ;1000 1000b 4513 cmp eobd, 44 ;if (eob > 43) 4514 cmovns r4d, r5d ; iteration_count+2 4515 cmp eobd, 151 ;if (eob > 150) 4516 mov r3d, 34952 ;1000 1000 1000 1000b 4517 cmovs r3d, r4d ; iteration_count += 4 4518 4519%if ARCH_X86_32 4520 LEA r5, $$ 4521%endif 4522 lea r4, [dstq+8] 4523 mov [rsp+16*3], r4 4524 4525.loop: 4526 LOAD_8ROWS coeffq, 32, 1 4527 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 4528 mova [rsp+16*1], m6 4529 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] 4530 call m(idct_8x8_internal_8bpc).pass1_end3 4531 mova [rsp+16*1], m5 4532 mova [rsp+16*2], m6 4533 mova m6, [o(pw_1697x16)] 4534 REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 4535 pmulhrsw m7, [o(pw_2048)] 4536 mova m5, [rsp+16*1] 4537 mova [rsp+16*0], m7 4538 IDTX16 5, 7, 6 4539 mova m7, [rsp+16*2] 4540 IDTX16 7, 6, 6 4541 mova m6, [o(pw_2048)] 4542 REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 4543 mova [rsp+16*2], m5 4544 mova [rsp+16*1], m7 4545 call m(idct_8x8_internal_8bpc).end3 4546 lea dstq,
[dstq+strideq*2] 4547 pxor m7, m7 4548 REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 4549 4550.loop_end: 4551 add coeffq, 16 4552 shr r3d, 2 4553 jz .ret 4554 test r3d, 2 4555 jnz .loop 4556 mov r4d, r3d 4557 and r4d, 1 4558 lea coeffq, [coeffq+r4*8+32*7] 4559 mov dstq, [rsp+16*3] 4560 lea r4, [dstq+8] 4561 mov [rsp+16*3], r4 4562 jmp .loop 4563 4564.ret: 4565 RET 4566 4567 4568cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 4569%if ARCH_X86_32 4570 LEA r5, $$ 4571%endif 4572 test eobd, eobd 4573 jz .dconly 4574 4575 call m(idct_32x32_internal_8bpc) 4576 RET 4577 4578.dconly: 4579 movd m1, [o(pw_2896x8)] 4580 pmulhrsw m0, m1, [coeffq] 4581 movd m2, [o(pw_8192)] 4582 mov [coeffq], eobd 4583 mov r3d, 32 4584 lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] 4585 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body 4586 4587 4588cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 4589 mov r4d, 2 4590 sub eobd, 136 4591 mov [rsp+gprsize*1+16*35], eobd 4592 mov r3d, 4 4593 cmovs r3d, r4d 4594 4595%if ARCH_X86_32 4596 LEA r5, $$ 4597%endif 4598 4599 mov [rsp+gprsize*2+16*35], coeffq 4600 4601.pass1_loop: 4602 LOAD_8ROWS coeffq+64*1, 64*2 4603 mova [rsp+gprsize+16*19], m0 ;in1 4604 mova [rsp+gprsize+16*26], m1 ;in3 4605 mova [rsp+gprsize+16*23], m2 ;in5 4606 mova [rsp+gprsize+16*22], m3 ;in7 4607 mova [rsp+gprsize+16*21], m4 ;in9 4608 mova [rsp+gprsize+16*24], m5 ;in11 4609 mova [rsp+gprsize+16*25], m6 ;in13 4610 mova [rsp+gprsize+16*20], m7 ;in15 4611 4612 mov tx2d, [rsp+gprsize*1+16*35] 4613 test tx2d, tx2d 4614 jl .fast 4615 4616.full: 4617 LOAD_8ROWS coeffq+64*0, 64*4 4618 call m(idct_8x8_internal_8bpc).main 4619 SAVE_7ROWS rsp+gprsize+16*3, 16 4620 LOAD_8ROWS coeffq+64*2, 64*4 4621 call m(idct_16x8_internal_8bpc).main 4622 mova m7, [rsp+gprsize+16*0] 4623 SAVE_8ROWS rsp+gprsize+16*11, 16 4624 4625 LOAD_8ROWS coeffq+64*17, 64*2 4626 mova [rsp+gprsize+16*33], m0 ;in17 4627 mova [rsp+gprsize+16*28], m1 ;in19 
4628 mova [rsp+gprsize+16*29], m2 ;in21 4629 mova [rsp+gprsize+16*32], m3 ;in23 4630 mova [rsp+gprsize+16*31], m4 ;in25 4631 mova [rsp+gprsize+16*30], m5 ;in27 4632 mova [rsp+gprsize+16*27], m6 ;in29 4633 mova [rsp+gprsize+16*34], m7 ;in31 4634 4635 call m(idct_8x32_internal_8bpc).main 4636 jmp .pass1_end 4637 4638.fast: 4639 mova m0, [coeffq+256*0] 4640 mova m1, [coeffq+256*1] 4641 mova m2, [coeffq+256*2] 4642 mova m3, [coeffq+256*3] 4643 pxor m4, m4 4644 REPX {mova x, m4}, m5, m6, m7 4645 call m(idct_8x8_internal_8bpc).main 4646 4647 SAVE_7ROWS rsp+gprsize+16*3, 16 4648 mova m0, [coeffq+128*1] 4649 mova m1, [coeffq+128*3] 4650 mova m2, [coeffq+128*5] 4651 mova m3, [coeffq+128*7] 4652 pxor m4, m4 4653 REPX {mova x, m4}, m5, m6, m7 4654 call m(idct_16x8_internal_8bpc).main 4655 mova m7, [rsp+gprsize+16*0] 4656 SAVE_8ROWS rsp+gprsize+16*11, 16 4657 4658 call m(idct_8x32_internal_8bpc).main_fast 4659 4660.pass1_end: 4661 mova [rsp+gprsize+16*0], m7 4662 mova m7, [o(pw_8192)] 4663 lea tx2q, [o(.pass1_end1)] 4664 jmp m(idct_8x8_internal_8bpc).pass1_end1 4665 4666.pass1_end1: 4667 SAVE_8ROWS coeffq+64*0, 64 4668 LOAD_8ROWS rsp+gprsize+16*11, 16 4669 mova [rsp+gprsize+16*0], m7 4670 mova m7, [o(pw_8192)] 4671 lea tx2q, [o(.pass1_end2)] 4672 jmp m(idct_8x8_internal_8bpc).pass1_end1 4673 4674.pass1_end2: 4675 SAVE_8ROWS coeffq+64*8, 64 4676 LOAD_8ROWS rsp+gprsize+16*19, 16 4677 mova [rsp+gprsize+16*0], m7 4678 mova m7, [o(pw_8192)] 4679 lea tx2q, [o(.pass1_end3)] 4680 jmp m(idct_8x8_internal_8bpc).pass1_end1 4681 4682.pass1_end3: 4683 SAVE_8ROWS coeffq+64*16, 64 4684 LOAD_8ROWS rsp+gprsize+16*27, 16 4685 mova [rsp+gprsize+16*0], m7 4686 mova m7, [o(pw_8192)] 4687 lea tx2q, [o(.pass1_end4)] 4688 jmp m(idct_8x8_internal_8bpc).pass1_end1 4689 4690.pass1_end4: 4691 SAVE_8ROWS coeffq+64*24, 64 4692 4693 add coeffq, 16 4694 dec r3d 4695 jg .pass1_loop 4696 4697 4698.pass2: 4699 mov coeffq, [rsp+gprsize*2+16*35] 4700 mov r3d, 4 4701 lea tx2q, [o(.pass2_end)] 4702 4703.pass2_loop: 
    ; (tail of the 32x32 pass-2 loop; the function header is above this view)
    ; save the loop counter and the destination of the next 8-pixel column
    mov       [rsp+gprsize*3+16*35], r3d
    lea       r3, [dstq+8]
    mov       [rsp+gprsize*2+16*35], r3

    mova      m0, [coeffq+16*4 ]
    mova      m1, [coeffq+16*12]
    mova      m2, [coeffq+16*20]
    mova      m3, [coeffq+16*28]
    mova      m4, [coeffq+16*5 ]
    mova      m5, [coeffq+16*13]
    mova      m6, [coeffq+16*21]
    mova      m7, [coeffq+16*29]
    mova      [rsp+gprsize+16*19], m0  ;in1
    mova      [rsp+gprsize+16*26], m1  ;in3
    mova      [rsp+gprsize+16*23], m2  ;in5
    mova      [rsp+gprsize+16*22], m3  ;in7
    mova      [rsp+gprsize+16*21], m4  ;in9
    mova      [rsp+gprsize+16*24], m5  ;in11
    mova      [rsp+gprsize+16*25], m6  ;in13
    mova      [rsp+gprsize+16*20], m7  ;in15

    ; a negative value in this slot selects the reduced-input path
    mov       eobd, [rsp+gprsize*1+16*35]
    test      eobd, eobd
    jl        .fast1

.full1:
    mova      m0, [coeffq+16*0 ]
    mova      m1, [coeffq+16*16]
    mova      m2, [coeffq+16*1 ]
    mova      m3, [coeffq+16*17]
    mova      m4, [coeffq+16*2 ]
    mova      m5, [coeffq+16*18]
    mova      m6, [coeffq+16*3 ]
    mova      m7, [coeffq+16*19]
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    mova      m0, [coeffq+16*8 ]
    mova      m1, [coeffq+16*24]
    mova      m2, [coeffq+16*9 ]
    mova      m3, [coeffq+16*25]
    mova      m4, [coeffq+16*10]
    mova      m5, [coeffq+16*26]
    mova      m6, [coeffq+16*11]
    mova      m7, [coeffq+16*27]
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova      m0, [coeffq+16*6 ]
    mova      m1, [coeffq+16*14]
    mova      m2, [coeffq+16*22]
    mova      m3, [coeffq+16*30]
    mova      m4, [coeffq+16*7 ]
    mova      m5, [coeffq+16*15]
    mova      m6, [coeffq+16*23]
    mova      m7, [coeffq+16*31]
    mova      [rsp+gprsize+16*33], m0  ;in17
    mova      [rsp+gprsize+16*28], m1  ;in19
    mova      [rsp+gprsize+16*29], m2  ;in21
    mova      [rsp+gprsize+16*32], m3  ;in23
    mova      [rsp+gprsize+16*31], m4  ;in25
    mova      [rsp+gprsize+16*30], m5  ;in27
    mova      [rsp+gprsize+16*27], m6  ;in29
    mova      [rsp+gprsize+16*34], m7  ;in31

    call      m(idct_8x32_internal_8bpc).main
    jmp       tx2q

.fast1:
    ; reduced-input path: only four inputs are loaded per stage, the
    ; remaining registers are zeroed, and in17..in31 are not loaded at all
    mova      m0, [coeffq+16*0 ]
    mova      m1, [coeffq+16*16]
    mova      m2, [coeffq+16*1 ]
    mova      m3, [coeffq+16*17]
    pxor      m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    mova      m0, [coeffq+16*8 ]
    mova      m1, [coeffq+16*24]
    mova      m2, [coeffq+16*9 ]
    mova      m3, [coeffq+16*25]
    pxor      m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call      m(idct_8x32_internal_8bpc).main_fast
    jmp       tx2q

.pass2_end:
    ; r3 = continuation label handed to the 8x32 .end code
    ; (presumably it jumps back through r3 -- confirm in idct_8x32_internal)
    lea       r3, [o(.pass2_end1)]
    jmp       m(idct_8x32_internal_8bpc).end

.pass2_end1:
    ; advance to the next block of coefficients (16*32 bytes) and the next
    ; destination column saved at the top of the loop, then loop
    lea       tx2q, [o(.pass2_end)]
    add       coeffq, 16*32
    mov       dstq, [rsp+gprsize*2+16*35]
    mov       r3d, [rsp+gprsize*3+16*35]
    dec       r3d
    jg        .pass2_loop

    ret


; inv_txfm_add_identity_identity_32x32_8bpc(dst, stride, coeff, eob)
; Processes the coefficients in 8-row groups: each inner iteration loads
; 8 rows (64 bytes apart) from coeffq, runs them through the 8x8
; pass1_end3/end3 helpers with a pw_8192 scale, clears those 8 rows and
; advances coeffq by 16 bytes.
; The inner and the outer trip count are both 2 when eob < 136 and
; 4 otherwise.
; Stack slots (at rsp+16*3):
;   gprsize*0: dst of the next outer iteration (current dst + 8)
;   gprsize*1: inner trip count, reloaded for every outer iteration
;   gprsize*2: remaining outer iterations
;   gprsize*3: coeff pointer at the start of the current outer iteration
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
    mov       r4d, 2
    cmp       eobd, 136
    mov       r3d, 4
    cmovs     r3d, r4d                 ; mov leaves the flags of cmp intact

%if ARCH_X86_32
    LEA       r5, $$
%endif

    lea       r4, [dstq+8]
    mov       [rsp+gprsize*0+16*3], r4
    mov       [rsp+gprsize*1+16*3], r3d
    mov       [rsp+gprsize*2+16*3], r3d
    mov       [rsp+gprsize*3+16*3], coeffq

.loop:
    LOAD_8ROWS coeffq, 64
    mova      [rsp+16*1], m6
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call      m(idct_8x8_internal_8bpc).pass1_end3
    ; scale all eight registers by pw_8192; m7 is scaled first and parked
    ; on the stack because it is needed to hold the constant
    pmulhrsw  m7, [o(pw_8192)]
    mova      [rsp+16*0], m7
    mova      m7, [o(pw_8192)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    mova      [rsp+16*1], m6
    mova      [rsp+16*2], m5
    call      m(idct_8x8_internal_8bpc).end3
    lea       dstq, [dstq+strideq*2]

    ; clear the 8 coefficient rows that were just consumed
    pxor      m7, m7
    REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7

    add       coeffq, 16
    dec       r3d
    jg        .loop

    mov       r4d, [rsp+gprsize*2+16*3]
    dec       r4d
    jle       .ret

    ; next outer iteration: dst moves 8 bytes right, coeff moves on by
    ; 64*8 bytes from the start of the previous outer iteration
    mov       dstq, [rsp+gprsize*0+16*3]
    mov       coeffq, [rsp+gprsize*3+16*3]
    mov       [rsp+gprsize*2+16*3], r4
    lea       r3, [dstq+8]
    add       coeffq, 64*8
    mov       [rsp+gprsize*0+16*3], r3
    mov       r3d, [rsp+gprsize*1+16*3]
    mov       [rsp+gprsize*3+16*3], coeffq
    jmp       .loop

.ret:
    RET


; inv_txfm_add_dct_dct_16x64_8bpc(dst, stride, coeff, eob)
; Entry point: eob == 0 takes the DC-only shortcut shared with the 16x4
; transform, anything else runs the full idct_16x64_internal_8bpc.
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly
    call      m(idct_16x64_internal_8bpc)
.end:
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_8192)]
    mov       [coeffq], eobd           ; eobd is 0 here: clears the first dword of coeff
    mov       r2d, 32                  ; count consumed by the 16x4 .dconly code
    lea       tx2q, [o(.end)]          ; continuation after the shared code
    jmp       m(inv_txfm_add_dct_dct_16x4_8bpc).dconly


; idct_16x64_internal_8bpc
; Called from the entry point above, so every stack offset carries one
; extra gprsize for the return address.
; Stack slots (at rsp+16*67):
;   gprsize*1: eob - 151; a negative value selects the .fast pass-2 path
;   gprsize*2: coeff pointer during pass 1, next dst (dst + 8) during pass 2
;   gprsize*3: pass-2 loop counter
; Pass 1 runs 2 iterations when eob < 151 and 4 otherwise; pass 2 always
; runs 2 iterations.
cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mov       r4d, 2
    sub       eobd, 151
    mov       [rsp+gprsize*1+16*67], eobd
    mov       r3d, 4
    cmovs     r3d, r4d                 ; flags still come from the sub above

%if ARCH_X86_32
    LEA       r5, $$
%endif

    mov       [rsp+gprsize*2+16*67], coeffq

.pass1_loop:
    ; rows at byte offsets 0, 128, 256, ... go through the 8x8 main,
    ; rows at 64, 192, 320, ... through the 16x8 main
    LOAD_8ROWS coeffq+64*0, 64*2
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+64*1, 64*2
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(.pass1_end)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(.pass1_end1)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+64*0, 64

    add       coeffq, 16
    dec       r3d
    jg        .pass1_loop

    ; pass 2 setup: restore coeff, reuse its slot for the next dst column,
    ; and keep the continuation label .end1 in r4
    mov       coeffq, [rsp+gprsize*2+16*67]
    mov       r3d, 2
    lea       r4, [dstq+8]
    mov       [rsp+gprsize*2+16*67], r4
    lea       r4, [o(.end1)]

.pass2_loop:
    mov       [rsp+gprsize*3+16*67], r3d
    mov       eobd, [rsp+gprsize*1+16*67]

    mova      m0, [coeffq+16*4 ]  ;in1
    mova      m1, [coeffq+16*12]  ;in3
    mova      m2, [coeffq+16*20]  ;in5
    mova      m3, [coeffq+16*28]  ;in7
    mova      m4, [coeffq+16*5 ]  ;in9
    mova      m5, [coeffq+16*13]  ;in11
    mova      m6, [coeffq+16*21]  ;in13
    mova      m7, [coeffq+16*29]  ;in15
    ; park in1..in15 in the stack slots that .main/.main_fast read them from
    mova      [rsp+gprsize+16*35], m0  ;in1
    mova      [rsp+gprsize+16*49], m1  ;in3
    mova      [rsp+gprsize+16*43], m2  ;in5
    mova      [rsp+gprsize+16*41], m3  ;in7
    mova      [rsp+gprsize+16*39], m4  ;in9
    mova      [rsp+gprsize+16*45], m5  ;in11
    mova      [rsp+gprsize+16*47], m6  ;in13
    mova      [rsp+gprsize+16*37], m7  ;in15

    pxor      m4, m4
    mova      m0, [coeffq+16*0]
    mova      m1, [coeffq+16*1]

    ; eobd was reloaded from the slot holding eob - 151
    test      eobd, eobd
    jl        .fast

.full:
    mova      m2, [coeffq+16*2]
    mova      m3, [coeffq+16*3]

    REPX {mova x, m4}, m5, m6, m7
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor      m4, m4
    mova      m0, [coeffq+16*16]
    mova      m1, [coeffq+16*17]
    mova      m2, [coeffq+16*18]
    mova      m3, [coeffq+16*19]

    REPX {mova x, m4}, m5, m6, m7
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova      m0, [coeffq+16*8 ]
    mova      m1, [coeffq+16*24]
    mova      m2, [coeffq+16*9 ]
    mova      m3, [coeffq+16*25]
    mova      m4, [coeffq+16*10]
    mova      m5, [coeffq+16*26]
    mova      m6, [coeffq+16*11]
    mova      m7, [coeffq+16*27]
    mova      [rsp+gprsize+16*19], m0
    mova      [rsp+gprsize+16*26], m1
    mova      [rsp+gprsize+16*23], m2
    mova      [rsp+gprsize+16*22], m3
    mova      [rsp+gprsize+16*21], m4
    mova      [rsp+gprsize+16*24], m5
    mova      [rsp+gprsize+16*25], m6
    mova      [rsp+gprsize+16*20], m7

    call      m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    mova      m0, [coeffq+16*6 ]  ;in17
    mova      m1, [coeffq+16*14]  ;in19
    mova      m2, [coeffq+16*22]  ;in21
    mova      m3, [coeffq+16*30]  ;in23
    mova      m4, [coeffq+16*7 ]  ;in25
    mova      m5, [coeffq+16*15]  ;in27
    mova      m6, [coeffq+16*23]  ;in29
    mova      m7, [coeffq+16*31]  ;in31
    mova      [rsp+gprsize+16*63], m0  ;in17
    mova      [rsp+gprsize+16*53], m1  ;in19
    mova      [rsp+gprsize+16*55], m2  ;in21
    mova      [rsp+gprsize+16*61], m3  ;in23
    mova      [rsp+gprsize+16*59], m4  ;in25
    mova      [rsp+gprsize+16*57], m5  ;in27
    mova      [rsp+gprsize+16*51], m6  ;in29
    mova      [rsp+gprsize+16*65], m7  ;in31

    call      .main
    jmp       .end

.fast:
    ; reduced-input path: only two inputs feed each of the 8x8/16x8 stages,
    ; four feed the 8x32 stage, and in17..in31 are never loaded
    REPX {mova x, m4}, m2, m3, m5, m6, m7
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor      m4, m4
    mova      m0, [coeffq+16*16]
    mova      m1, [coeffq+16*17]

    REPX {mova x, m4}, m2, m3, m5, m6, m7
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova      m0, [coeffq+16*8 ]
    mova      m1, [coeffq+16*24]
    mova      m2, [coeffq+16*9 ]
    mova      m3, [coeffq+16*25]
    mova      [rsp+gprsize+16*19], m0  ;in1
    mova      [rsp+gprsize+16*26], m1  ;in3
    mova      [rsp+gprsize+16*23], m2  ;in5
    mova      [rsp+gprsize+16*22], m3  ;in7

    call      m(idct_8x32_internal_8bpc).main_veryfast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    call      .main_fast

.end:
    ; reload the first 8 result rows and hand over to the 8x32 .end2 code;
    ; r4 holds the address of .end1 (set before .pass2_loop and again in
    ; .end1) and is passed along in r3
    ; (presumably .end2 continues at r3 -- confirm in idct_8x32_internal)
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    mov       r3, r4
    jmp       m(idct_8x32_internal_8bpc).end2

.end1:
    ; write the rows held in stack slots 16*35..16*66, then step to the
    ; next 8-pixel column
    LOAD_8ROWS rsp+gprsize+16*35, 16
    lea       dstq, [dstq+strideq*2]
    lea       r3, [rsp+16*32+gprsize]
    call      .write
    mov       dstq, [rsp+gprsize*2+16*67]
    mov       r3d, [rsp+gprsize*3+16*67]
    lea       r4, [dstq+8]
    mov       [rsp+gprsize*2+16*67], r4
    lea       r4, [o(.end1)]

    dec       r3d
    jg        .pass2_loop
    ret

; .write: in: m0-m7 = first 8 rows, r3 = base of a row buffer whose
; remaining rows sit at r3+16*11, r3+16*19 and r3+16*27.
; Clears the 16*32 bytes starting at the incoming coeffq, leaves coeffq
; advanced by 16*32, and writes four groups of 8 rows. Clobbers r4.
.write:
    mova      [r3+16*0], m7
    mov       r4, -16*32
    pxor      m7, m7
    sub       coeffq, r4               ; coeffq += 16*32, r4 counts up to 0
.zero_loop:
    mova      [coeffq+r4+16*0], m7
    mova      [coeffq+r4+16*1], m7
    add       r4, 16*2
    jl        .zero_loop
    call      .write_main2             ; m7 is already spilled to [r3]
    LOAD_8ROWS r3+16*11, 16
    call      .write_main
    LOAD_8ROWS r3+16*19, 16
    call      .write_main
    LOAD_8ROWS r3+16*27, 16
    ; falls through into .write_main for the last group; its ret returns
    ; to the caller of .write
.write_main:
    mova      [r3+16*0], m7
.write_main2:
    ; scale m0-m6 and the spilled m7 by pw_2048, then write two groups of
    ; four rows, advancing dst by two strides after each WRITE_8X4
    mova      m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw  m7, [r3+16*0]
    mova      [r3+16*2], m5
    mova      [r3+16*1], m6
    mova      [r3+16*0], m7
    WRITE_8X4 0, 1, 2, 3, 5, 6, 7
    lea       dstq, [dstq+strideq*2]
    WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
    lea       dstq, [dstq+strideq*2]
    ret


; .main_fast: variant of .main that reads only in1..in15 from the stack.
; Each input yields a pair of t values through two pmulhrsw, and the code
; joins .main at .main2.
; Both .main_fast and .main are entered via call, so they address the
; stack with gprsize*2 where the callers above use gprsize.
ALIGN function_align
cglobal_label .main_fast
    mova      m0, [rsp+gprsize*2+16*35]  ;in1
    pmulhrsw  m3, m0, [o(pw_4095x8)]  ;t62,t63
    pmulhrsw  m0, [o(pw_101x8)]  ;t32,t33
    mova      m7, [o(pd_2048)]
    mova      [rsp+gprsize*2+16*35], m0  ;t32
    mova      [rsp+gprsize*2+16*66], m3  ;t63
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076  ;t33a, t62a
    mova      [rsp+gprsize*2+16*36], m3  ;t33a
    mova      [rsp+gprsize*2+16*65], m0  ;t62a

    mova      m1, [rsp+gprsize*2+16*37]  ;in15
    pmulhrsw  m2, m1, [o(pw_3822x8)]  ;t60,t61
    pmulhrsw  m1, [o(pw_m1474x8)]  ;t34,t35
    mova      [rsp+gprsize*2+16*38], m1  ;t35
    mova      [rsp+gprsize*2+16*63], m2  ;t60
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401  ;t34a, t61a
    mova      [rsp+gprsize*2+16*37], m2  ;t34a
    mova      [rsp+gprsize*2+16*64], m1  ;t61a

    mova      m0, [rsp+gprsize*2+16*39]  ;in9
    pmulhrsw  m3, m0, [o(pw_3996x8)]  ;t58,t59
    pmulhrsw  m0, [o(pw_897x8)]  ;t36,t37
    mova      [rsp+gprsize*2+16*39], m0  ;t36
    mova      [rsp+gprsize*2+16*62], m3  ;t59
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598  ;t37a, t58a
    mova      [rsp+gprsize*2+16*40], m3  ;t37a
    mova      [rsp+gprsize*2+16*61], m0  ;t58a

    mova      m1, [rsp+gprsize*2+16*41]  ;in7
    pmulhrsw  m2, m1, [o(pw_4036x8)]  ;t56,t57
    pmulhrsw  m1, [o(pw_m700x8)]  ;t38,t39
    mova      [rsp+gprsize*2+16*42], m1  ;t39
    mova      [rsp+gprsize*2+16*59], m2  ;t56
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166  ;t38a, t57a
    mova      [rsp+gprsize*2+16*41], m2  ;t38a
    mova      [rsp+gprsize*2+16*60], m1  ;t57a

    mova      m0, [rsp+gprsize*2+16*43]  ;in5
    pmulhrsw  m3, m0, [o(pw_4065x8)]  ;t54,t55
    pmulhrsw  m0, [o(pw_501x8)]  ;t40,t41
    mova      [rsp+gprsize*2+16*43], m0  ;t40
    mova      [rsp+gprsize*2+16*58], m3  ;t55
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612  ;t41a, t54a
    mova      [rsp+gprsize*2+16*44], m3  ;t41a
    mova      [rsp+gprsize*2+16*57], m0  ;t54a

    mova      m1, [rsp+gprsize*2+16*45]  ;in11
    pmulhrsw  m2, m1, [o(pw_3948x8)]  ;t52,t53
    pmulhrsw  m1, [o(pw_m1092x8)]  ;t42,t43
    mova      [rsp+gprsize*2+16*46], m1  ;t43
    mova      [rsp+gprsize*2+16*55], m2  ;t52
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931  ;t42a, t53a
    mova      [rsp+gprsize*2+16*45], m2  ;t42a
    mova      [rsp+gprsize*2+16*56], m1  ;t53a

    mova      m0, [rsp+gprsize*2+16*47]  ;in13
    pmulhrsw  m3, m0, [o(pw_3889x8)]  ;t50,t51
    pmulhrsw  m0, [o(pw_1285x8)]  ;t44,t45
    mova      m6, m0                   ; .main2 expects t44 in m6
    mova      [rsp+gprsize*2+16*54], m3  ;t51
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189  ;t45a, t50a
    mova      [rsp+gprsize*2+16*48], m3  ;t45a
    mova      [rsp+gprsize*2+16*53], m0  ;t50a

    mova      m0, [rsp+gprsize*2+16*49]  ;in3
    pmulhrsw  m3, m0, [o(pw_4085x8)]  ;t48,t49
    pmulhrsw  m0, [o(pw_m301x8)]  ;t46,t47
    mova      m4, m3                   ; .main2 expects t49 in m4
    mova      m5, m0                   ; and t46 in m5

    jmp       .main2

; .main: in: in1..in31 in the stack slots annotated on the loads below,
; tmp[0..31] in slots 16*3..16*34.
; out: outN is left in slot 16*(N+3) for N = 0..63 (all offsets relative
; to rsp+gprsize*2). Uses m0-m7; m7 holds pd_2048 throughout.
ALIGN function_align
cglobal_label .main
    mova      m0, [rsp+gprsize*2+16*35]  ;in1
    mova      m1, [rsp+gprsize*2+16*65]  ;in31
    pmulhrsw  m3, m0, [o(pw_4095x8)]  ;t63a
    pmulhrsw  m0, [o(pw_101x8)]  ;t32a
    pmulhrsw  m2, m1, [o(pw_2967x8)]  ;t62a
    pmulhrsw  m1, [o(pw_m2824x8)]  ;t33a
    mova      m7, [o(pd_2048)]
    psubsw    m4, m0, m1  ;t33
    paddsw    m0, m1  ;t32
    psubsw    m5, m3, m2  ;t62
    paddsw    m3, m2  ;t63
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076  ;t33a, t62a
    mova      [rsp+gprsize*2+16*35], m0  ;t32
    mova      [rsp+gprsize*2+16*36], m5  ;t33a
    mova      [rsp+gprsize*2+16*65], m4  ;t62a
    mova      [rsp+gprsize*2+16*66], m3  ;t63

    mova      m0, [rsp+gprsize*2+16*63]  ;in17
    mova      m1, [rsp+gprsize*2+16*37]  ;in15
    pmulhrsw  m3, m0, [o(pw_3745x8)]  ;t61a
    pmulhrsw  m0, [o(pw_1660x8)]  ;t34a
    pmulhrsw  m2, m1, [o(pw_3822x8)]  ;t60a
    pmulhrsw  m1, [o(pw_m1474x8)]  ;t35a
    psubsw    m4, m1, m0  ;t34
    paddsw    m0, m1  ;t35
    psubsw    m5, m2, m3  ;t61
    paddsw    m3, m2  ;t60
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401  ;t34a, t61a
    mova      [rsp+gprsize*2+16*37], m5  ;t34a
    mova      [rsp+gprsize*2+16*38], m0  ;t35
    mova      [rsp+gprsize*2+16*63], m3  ;t60
    mova      [rsp+gprsize*2+16*64], m4  ;t61a

    mova      m0, [rsp+gprsize*2+16*39]  ;in9
    mova      m1, [rsp+gprsize*2+16*61]  ;in23
    pmulhrsw  m3, m0, [o(pw_3996x8)]  ;t59a
    pmulhrsw  m0, [o(pw_897x8)]  ;t36a
    pmulhrsw  m2, m1, [o(pw_3461x8)]  ;t58a
    pmulhrsw  m1, [o(pw_m2191x8)]  ;t37a
    psubsw    m4, m0, m1  ;t37
    paddsw    m0, m1  ;t36
    psubsw    m5, m3, m2  ;t58
    paddsw    m3, m2  ;t59
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598  ;t37a, t58a
    mova      [rsp+gprsize*2+16*39], m0  ;t36
    mova      [rsp+gprsize*2+16*40], m5  ;t37a
    mova      [rsp+gprsize*2+16*61], m4  ;t58a
    mova      [rsp+gprsize*2+16*62], m3  ;t59

    mova      m0, [rsp+gprsize*2+16*59]  ;in25
    mova      m1, [rsp+gprsize*2+16*41]  ;in7
    pmulhrsw  m3, m0, [o(pw_3349x8)]  ;t57a
    pmulhrsw  m0, [o(pw_2359x8)]  ;t38a
    pmulhrsw  m2, m1, [o(pw_4036x8)]  ;t56a
    pmulhrsw  m1, [o(pw_m700x8)]  ;t39a
    psubsw    m4, m1, m0  ;t38
    paddsw    m0, m1  ;t39
    psubsw    m5, m2, m3  ;t57
    paddsw    m3, m2  ;t56
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166  ;t38a, t57a
    mova      [rsp+gprsize*2+16*41], m5  ;t38a
    mova      [rsp+gprsize*2+16*42], m0  ;t39
    mova      [rsp+gprsize*2+16*59], m3  ;t56
    mova      [rsp+gprsize*2+16*60], m4  ;t57a

    mova      m0, [rsp+gprsize*2+16*43]  ;in5
    mova      m1, [rsp+gprsize*2+16*57]  ;in27
    pmulhrsw  m3, m0, [o(pw_4065x8)]  ;t55a
    pmulhrsw  m0, [o(pw_501x8)]  ;t40a
    pmulhrsw  m2, m1, [o(pw_3229x8)]  ;t54a
    pmulhrsw  m1, [o(pw_m2520x8)]  ;t41a
    psubsw    m4, m0, m1  ;t41
    paddsw    m0, m1  ;t40
    psubsw    m5, m3, m2  ;t54
    paddsw    m3, m2  ;t55
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612  ;t41a, t54a
    mova      [rsp+gprsize*2+16*43], m0  ;t40
    mova      [rsp+gprsize*2+16*44], m5  ;t41a
    mova      [rsp+gprsize*2+16*57], m4  ;t54a
    mova      [rsp+gprsize*2+16*58], m3  ;t55

    mova      m0, [rsp+gprsize*2+16*55]  ;in21
    mova      m1, [rsp+gprsize*2+16*45]  ;in11
    pmulhrsw  m3, m0, [o(pw_3564x8)]  ;t53a
    pmulhrsw  m0, [o(pw_2019x8)]  ;t42a
    pmulhrsw  m2, m1, [o(pw_3948x8)]  ;t52a
    pmulhrsw  m1, [o(pw_m1092x8)]  ;t43a
    psubsw    m4, m1, m0  ;t42
    paddsw    m0, m1  ;t43
    psubsw    m5, m2, m3  ;t53
    paddsw    m3, m2  ;t52
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931  ;t42a, t53a
    mova      [rsp+gprsize*2+16*45], m5  ;t42a
    mova      [rsp+gprsize*2+16*46], m0  ;t43
    mova      [rsp+gprsize*2+16*55], m3  ;t52
    mova      [rsp+gprsize*2+16*56], m4  ;t53a

    mova      m0, [rsp+gprsize*2+16*47]  ;in13
    mova      m1, [rsp+gprsize*2+16*53]  ;in19
    pmulhrsw  m3, m0, [o(pw_3889x8)]  ;t51a
    pmulhrsw  m0, [o(pw_1285x8)]  ;t44a
    pmulhrsw  m2, m1, [o(pw_3659x8)]  ;t50a
    pmulhrsw  m1, [o(pw_m1842x8)]  ;t45a
    psubsw    m4, m0, m1  ;t45
    paddsw    m0, m1  ;t44
    psubsw    m5, m3, m2  ;t50
    paddsw    m3, m2  ;t51
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189  ;t45a, t50a
    mova      m6, m0                   ; keep t44 in m6 for .main2
    mova      [rsp+gprsize*2+16*48], m5  ;t45a
    mova      [rsp+gprsize*2+16*53], m4  ;t50a
    mova      [rsp+gprsize*2+16*54], m3  ;t51

    mova      m0, [rsp+gprsize*2+16*51]  ;in29
    mova      m1, [rsp+gprsize*2+16*49]  ;in3
    pmulhrsw  m3, m0, [o(pw_3102x8)]  ;t49a
    pmulhrsw  m0, [o(pw_2675x8)]  ;t46a
    pmulhrsw  m2, m1, [o(pw_4085x8)]  ;t48a
    pmulhrsw  m1, [o(pw_m301x8)]  ;t47a
    psubsw    m5, m1, m0  ;t46
    paddsw    m0, m1  ;t47
    psubsw    m4, m2, m3  ;t49
    paddsw    m3, m2  ;t48

; .main2: common tail of .main and .main_fast.
; in: m0 = t47, m3 = t48, m4 = t49, m5 = t46, m6 = t44, m7 = pd_2048,
; every other t value in its stack slot.
ALIGN function_align
.main2:
    ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920  ;t46a, t49a
    mova      m1, [rsp+gprsize*2+16*54]  ;t51
    psubsw    m2, m0, m6  ;t44a
    paddsw    m0, m6  ;t47a
    psubsw    m6, m3, m1  ;t51a
    paddsw    m3, m1  ;t48a
    mova      [rsp+gprsize*2+16*50], m0  ;t47a
    mova      [rsp+gprsize*2+16*51], m3  ;t48a
    ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406  ;t44, t51
    mova      [rsp+gprsize*2+16*47], m6  ;t44
    mova      [rsp+gprsize*2+16*54], m2  ;t51

    mova      m0, [rsp+gprsize*2+16*48]  ;t45a
    mova      m3, [rsp+gprsize*2+16*53]  ;t50a
    psubsw    m2, m4, m0  ;t45
    paddsw    m4, m0  ;t46
    psubsw    m6, m5, m3  ;t50
    paddsw    m5, m3  ;t49
    ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406  ;t45a, t50a
    mova      [rsp+gprsize*2+16*48], m6  ;t45a
    mova      [rsp+gprsize*2+16*49], m4  ;t46
    mova      [rsp+gprsize*2+16*52], m5  ;t49
    mova      [rsp+gprsize*2+16*53], m2  ;t50a

    mova      m0, [rsp+gprsize*2+16*43]  ;t40
    mova      m2, [rsp+gprsize*2+16*46]  ;t43
    mova      m3, [rsp+gprsize*2+16*55]  ;t52
    mova      m1, [rsp+gprsize*2+16*58]  ;t55
    psubsw    m4, m0, m2  ;t43a
    paddsw    m0, m2  ;t40a
    psubsw    m5, m1, m3  ;t52a
    paddsw    m1, m3  ;t55a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276  ;t43, t52
    mova      [rsp+gprsize*2+16*43], m0  ;t40a
    mova      [rsp+gprsize*2+16*46], m5  ;t43
    mova      [rsp+gprsize*2+16*55], m4  ;t52
    mova      [rsp+gprsize*2+16*58], m1  ;t55a

    mova      m0, [rsp+gprsize*2+16*44]  ;t41a
    mova      m2, [rsp+gprsize*2+16*45]  ;t42a
    mova      m3, [rsp+gprsize*2+16*56]  ;t53a
    mova      m1, [rsp+gprsize*2+16*57]  ;t54a
    psubsw    m4, m0, m2  ;t42
    paddsw    m0, m2  ;t41
    psubsw    m5, m1, m3  ;t53
    paddsw    m1, m3  ;t54
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276  ;t42a, t53a
    mova      [rsp+gprsize*2+16*44], m0  ;t41
    mova      [rsp+gprsize*2+16*45], m5  ;t42a
    mova      [rsp+gprsize*2+16*56], m4  ;t53a
    mova      [rsp+gprsize*2+16*57], m1  ;t54

    mova      m0, [rsp+gprsize*2+16*41]  ;t38a
    mova      m2, [rsp+gprsize*2+16*40]  ;t37a
    mova      m3, [rsp+gprsize*2+16*61]  ;t58a
    mova      m1, [rsp+gprsize*2+16*60]  ;t57a
    psubsw    m4, m0, m2  ;t37
    paddsw    m0, m2  ;t38
    psubsw    m5, m1, m3  ;t58
    paddsw    m1, m3  ;t57
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799  ;t37a, t58a
    mova      [rsp+gprsize*2+16*41], m0  ;t38
    mova      [rsp+gprsize*2+16*40], m5  ;t37a
    mova      [rsp+gprsize*2+16*61], m4  ;t58a
    mova      [rsp+gprsize*2+16*60], m1  ;t57

    mova      m0, [rsp+gprsize*2+16*42]  ;t39
    mova      m2, [rsp+gprsize*2+16*39]  ;t36
    mova      m3, [rsp+gprsize*2+16*62]  ;t59
    mova      m1, [rsp+gprsize*2+16*59]  ;t56
    psubsw    m4, m0, m2  ;t36a
    paddsw    m0, m2  ;t39a
    psubsw    m5, m1, m3  ;t59a
    paddsw    m1, m3  ;t56a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799  ;t36, t59
    mova      [rsp+gprsize*2+16*42], m0  ;t39a
    mova      [rsp+gprsize*2+16*39], m5  ;t36
    mova      [rsp+gprsize*2+16*62], m4  ;t59
    mova      [rsp+gprsize*2+16*59], m1  ;t56a

    mova      m0, [rsp+gprsize*2+16*35]  ;t32
    mova      m2, [rsp+gprsize*2+16*38]  ;t35
    mova      m3, [rsp+gprsize*2+16*63]  ;t60
    mova      m1, [rsp+gprsize*2+16*66]  ;t63
    psubsw    m4, m0, m2  ;t35a
    paddsw    m0, m2  ;t32a
    psubsw    m5, m1, m3  ;t60a
    paddsw    m1, m3  ;t63a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017  ;t35, t60
    mova      [rsp+gprsize*2+16*35], m0  ;t32a
    mova      [rsp+gprsize*2+16*38], m5  ;t35
    mova      [rsp+gprsize*2+16*63], m4  ;t60
    mova      [rsp+gprsize*2+16*66], m1  ;t63a

    mova      m0, [rsp+gprsize*2+16*36]  ;t33a
    mova      m2, [rsp+gprsize*2+16*37]  ;t34a
    mova      m3, [rsp+gprsize*2+16*64]  ;t61a
    mova      m1, [rsp+gprsize*2+16*65]  ;t62a
    psubsw    m4, m0, m2  ;t34
    paddsw    m0, m2  ;t33
    psubsw    m5, m1, m3  ;t61
    paddsw    m1, m3  ;t62
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017  ;t34a, t61a

    mova      m2, [rsp+gprsize*2+16*41]  ;t38
    mova      m3, [rsp+gprsize*2+16*60]  ;t57
    psubsw    m6, m0, m2  ;t38a
    paddsw    m0, m2  ;t33a
    psubsw    m2, m1, m3  ;t57a
    paddsw    m1, m3  ;t62a
    mova      [rsp+gprsize*2+16*36], m0  ;t33a
    mova      [rsp+gprsize*2+16*65], m1  ;t62a
    ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784  ;t38, t57
    mova      [rsp+gprsize*2+16*41], m2  ;t38
    mova      [rsp+gprsize*2+16*60], m6  ;t57

    mova      m2, [rsp+gprsize*2+16*40]  ;t37a
    mova      m3, [rsp+gprsize*2+16*61]  ;t58a
    psubsw    m0, m5, m2  ;t37
    paddsw    m5, m2  ;t34
    psubsw    m1, m4, m3  ;t58
    paddsw    m4, m3  ;t61
    ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784  ;t37a, t58a
    mova      [rsp+gprsize*2+16*37], m5  ;t34
    mova      [rsp+gprsize*2+16*64], m4  ;t61
    mova      [rsp+gprsize*2+16*40], m1  ;t37a
    mova      [rsp+gprsize*2+16*61], m0  ;t58a

    mova      m0, [rsp+gprsize*2+16*38]  ;t35
    mova      m2, [rsp+gprsize*2+16*39]  ;t36
    mova      m3, [rsp+gprsize*2+16*62]  ;t59
    mova      m1, [rsp+gprsize*2+16*63]  ;t60
    psubsw    m4, m0, m2  ;t36a
    paddsw    m0, m2  ;t35a
    psubsw    m5, m1, m3  ;t59a
    paddsw    m1, m3  ;t60a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784  ;t36, t59
    mova      [rsp+gprsize*2+16*38], m0  ;t35a
    mova      [rsp+gprsize*2+16*39], m5  ;t36
    mova      [rsp+gprsize*2+16*62], m4  ;t59
    mova      [rsp+gprsize*2+16*63], m1  ;t60a

    mova      m0, [rsp+gprsize*2+16*35]  ;t32a
    mova      m2, [rsp+gprsize*2+16*42]  ;t39a
    mova      m3, [rsp+gprsize*2+16*59]  ;t56a
    mova      m1, [rsp+gprsize*2+16*66]  ;t63a
    psubsw    m4, m0, m2  ;t39
    paddsw    m0, m2  ;t32
    psubsw    m5, m1, m3  ;t56
    paddsw    m1, m3  ;t63
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784  ;t39a, t56a
    mova      [rsp+gprsize*2+16*35], m0  ;t32
    mova      [rsp+gprsize*2+16*42], m5  ;t39a
    mova      [rsp+gprsize*2+16*59], m4  ;t56a
    mova      [rsp+gprsize*2+16*66], m1  ;t63

    mova      m0, [rsp+gprsize*2+16*50]  ;t47a
    mova      m2, [rsp+gprsize*2+16*43]  ;t40a
    mova      m3, [rsp+gprsize*2+16*58]  ;t55a
    mova      m1, [rsp+gprsize*2+16*51]  ;t48a
    psubsw    m4, m0, m2  ;t40
    paddsw    m0, m2  ;t47
    psubsw    m5, m1, m3  ;t55
    paddsw    m1, m3  ;t48
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t40a, t55a
    mova      [rsp+gprsize*2+16*50], m0  ;t47
    mova      [rsp+gprsize*2+16*43], m5  ;t40a
    mova      [rsp+gprsize*2+16*58], m4  ;t55a
    mova      [rsp+gprsize*2+16*51], m1  ;t48

    mova      m0, [rsp+gprsize*2+16*49]  ;t46
    mova      m2, [rsp+gprsize*2+16*44]  ;t41
    mova      m3, [rsp+gprsize*2+16*57]  ;t54
    mova      m1, [rsp+gprsize*2+16*52]  ;t49
    psubsw    m4, m0, m2  ;t41a
    paddsw    m0, m2  ;t46a
    psubsw    m5, m1, m3  ;t54a
    paddsw    m1, m3  ;t49a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t41, t54
    mova      [rsp+gprsize*2+16*49], m0  ;t46a
    mova      [rsp+gprsize*2+16*44], m5  ;t41
    mova      [rsp+gprsize*2+16*57], m4  ;t54
    mova      [rsp+gprsize*2+16*52], m1  ;t49a

    mova      m0, [rsp+gprsize*2+16*48]  ;t45a
    mova      m2, [rsp+gprsize*2+16*45]  ;t42a
    mova      m3, [rsp+gprsize*2+16*56]  ;t53a
    mova      m1, [rsp+gprsize*2+16*53]  ;t50a
    psubsw    m4, m0, m2  ;t42
    paddsw    m0, m2  ;t45
    psubsw    m5, m1, m3  ;t53
    paddsw    m1, m3  ;t50
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t42a, t53a
    mova      [rsp+gprsize*2+16*48], m0  ;t45
    mova      [rsp+gprsize*2+16*45], m5  ;t42a
    mova      [rsp+gprsize*2+16*56], m4  ;t53a
    mova      [rsp+gprsize*2+16*53], m1  ;t50

    mova      m0, [rsp+gprsize*2+16*47]  ;t44
    mova      m2, [rsp+gprsize*2+16*46]  ;t43
    mova      m3, [rsp+gprsize*2+16*55]  ;t52
    mova      m1, [rsp+gprsize*2+16*54]  ;t51
    psubsw    m4, m0, m2  ;t43a
    paddsw    m0, m2  ;t44a
    psubsw    m5, m1, m3  ;t52a
    paddsw    m1, m3  ;t51a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t43, t52

    ; final stage: combine the t values with tmp[0..31] into out0..out63;
    ; outN is stored to slot 16*(N+3)
    mova      m2, [rsp+gprsize*2+16*38]  ;t35a
    mova      m3, [rsp+gprsize*2+16*31]  ;tmp[28]
    psubsw    m6, m2, m0  ;t44
    paddsw    m2, m0  ;t35
    psubsw    m0, m3, m2  ;out35
    paddsw    m2, m3  ;out28
    mova      m3, [rsp+gprsize*2+16*63]  ;t60a
    mova      [rsp+gprsize*2+16*38], m0  ;out35
    mova      [rsp+gprsize*2+16*31], m2  ;out28
    psubsw    m0, m3, m1  ;t51
    paddsw    m3, m1  ;t60
    ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896  ;t44a, t51a
    mova      m2, [rsp+gprsize*2+16*6 ]  ;tmp[3]
    psubsw    m1, m2, m3  ;out60
    paddsw    m2, m3  ;out3
    mova      m3, [rsp+gprsize*2+16*22]  ;tmp[19]
    mova      [rsp+gprsize*2+16*63], m1  ;out60
    mova      [rsp+gprsize*2+16*6 ], m2  ;out3
    psubsw    m1, m3, m0  ;out44
    paddsw    m3, m0  ;out19
    mova      m2, [rsp+gprsize*2+16*15]  ;tmp[12]

    mova      m0, [rsp+gprsize*2+16*39]  ;t36
    mova      [rsp+gprsize*2+16*47], m1  ;out44
    mova      [rsp+gprsize*2+16*22], m3  ;out19
    mova      m1, [rsp+gprsize*2+16*62]  ;t59
    psubsw    m3, m2, m6  ;out51
    paddsw    m2, m6  ;out12
    mova      [rsp+gprsize*2+16*54], m3  ;out51
    mova      [rsp+gprsize*2+16*15], m2  ;out12
    psubsw    m2, m0, m5  ;t43a
    paddsw    m0, m5  ;t36a
    mova      m5, [rsp+gprsize*2+16*30]  ;tmp[27]
    psubsw    m3, m1, m4  ;t52a
    paddsw    m1, m4  ;t59a
    ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896  ;t43, t52
    mova      m4, [rsp+gprsize*2+16*7 ]  ;tmp[4 ]
    psubsw    m6, m5, m0  ;out36
    paddsw    m5, m0  ;out27
    psubsw    m0, m4, m1  ;out59
    paddsw    m4, m1  ;out4
    mova      [rsp+gprsize*2+16*39], m6  ;out36
    mova      [rsp+gprsize*2+16*30], m5  ;out27
    mova      [rsp+gprsize*2+16*62], m0  ;out59
    mova      [rsp+gprsize*2+16*7 ], m4  ;out4
    mova      m0, [rsp+gprsize*2+16*23]  ;tmp[20]
    mova      m5, [rsp+gprsize*2+16*14]  ;tmp[11]
    psubsw    m4, m0, m3  ;out43
    paddsw    m0, m3  ;out20
    psubsw    m6, m5, m2  ;out52
    paddsw    m5, m2  ;out11
    mova      [rsp+gprsize*2+16*46], m4  ;out43
    mova      [rsp+gprsize*2+16*23], m0  ;out20
    mova      [rsp+gprsize*2+16*55], m6  ;out52
    mova      [rsp+gprsize*2+16*14], m5  ;out11

    mova      m0, [rsp+gprsize*2+16*40]  ;t37a
    mova      m5, [rsp+gprsize*2+16*45]  ;t42a
    mova      m3, [rsp+gprsize*2+16*56]  ;t53a
    mova      m1, [rsp+gprsize*2+16*61]  ;t58a
    mova      m2, [rsp+gprsize*2+16*29]  ;tmp[26]
    psubsw    m4, m0, m5  ;t42
    paddsw    m0, m5  ;t37
    psubsw    m5, m1, m3  ;t53
    paddsw    m1, m3  ;t58
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896  ;t42a, t53a
    mova      m3, [rsp+gprsize*2+16*8 ]  ;tmp[5 ]
    psubsw    m6, m2, m0  ;out37
    paddsw    m2, m0  ;out26
    psubsw    m0, m3, m1  ;out58
    paddsw    m3, m1  ;out5
    mova      [rsp+gprsize*2+16*40], m6  ;out37
    mova      [rsp+gprsize*2+16*29], m2  ;out26
    mova      [rsp+gprsize*2+16*61], m0  ;out58
    mova      [rsp+gprsize*2+16*8 ], m3  ;out5
    mova      m0, [rsp+gprsize*2+16*24]  ;tmp[21]
    mova      m1, [rsp+gprsize*2+16*13]  ;tmp[10]
    psubsw    m2, m0, m5  ;out42
    paddsw    m0, m5  ;out21
    psubsw    m3, m1, m4  ;out53
    paddsw    m1, m4  ;out10
    mova      [rsp+gprsize*2+16*45], m2  ;out42
    mova      [rsp+gprsize*2+16*24], m0  ;out21
    mova      [rsp+gprsize*2+16*56], m3  ;out53
    mova      [rsp+gprsize*2+16*13], m1  ;out10

    mova      m0, [rsp+gprsize*2+16*41]  ;t38
    mova      m5, [rsp+gprsize*2+16*44]  ;t41
    mova      m3, [rsp+gprsize*2+16*57]  ;t54
    mova      m1, [rsp+gprsize*2+16*60]  ;t57
    mova      m2, [rsp+gprsize*2+16*28]  ;tmp[25]
    psubsw    m4, m0, m5  ;t41a
    paddsw    m0, m5  ;t38a
    psubsw    m5, m1, m3  ;t54a
    paddsw    m1, m3  ;t57a
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896  ;t41a, t54a
    mova      m3, [rsp+gprsize*2+16*9 ]  ;tmp[6 ]
    psubsw    m6, m2, m0  ;out38
    paddsw    m2, m0  ;out25
    psubsw    m0, m3, m1  ;out57
    paddsw    m3, m1  ;out6
    mova      [rsp+gprsize*2+16*41], m6  ;out38
    mova      [rsp+gprsize*2+16*28], m2  ;out25
    mova      [rsp+gprsize*2+16*60], m0  ;out57
    mova      [rsp+gprsize*2+16*9 ], m3  ;out6
    mova      m0, [rsp+gprsize*2+16*25]  ;tmp[22]
    mova      m1, [rsp+gprsize*2+16*12]  ;tmp[9 ]
    psubsw    m2, m0, m5  ;out41
    paddsw    m0, m5  ;out22
    psubsw    m3, m1, m4  ;out54
    paddsw    m1, m4  ;out9
    mova      [rsp+gprsize*2+16*44], m2  ;out41
    mova      [rsp+gprsize*2+16*25], m0  ;out22
    mova      [rsp+gprsize*2+16*57], m3  ;out54
    mova      [rsp+gprsize*2+16*12], m1  ;out9

    mova      m0, [rsp+gprsize*2+16*42]  ;t39a
    mova      m5, [rsp+gprsize*2+16*43]  ;t40a
    mova      m3, [rsp+gprsize*2+16*58]  ;t55a
    mova      m1, [rsp+gprsize*2+16*59]  ;t56a
    mova      m2, [rsp+gprsize*2+16*27]  ;tmp[24]
    psubsw    m4, m0, m5  ;t40
    paddsw    m0, m5  ;t39
    psubsw    m5, m1, m3  ;t55
    paddsw    m1, m3  ;t56
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896  ;t40a, t55a
    mova      m3, [rsp+gprsize*2+16*10]  ;tmp[7 ]
    psubsw    m6, m2, m0  ;out39
    paddsw    m2, m0  ;out24
    psubsw    m0, m3, m1  ;out56
    paddsw    m3, m1  ;out7
    mova      [rsp+gprsize*2+16*42], m6  ;out39
    mova      [rsp+gprsize*2+16*27], m2  ;out24
    mova      [rsp+gprsize*2+16*59], m0  ;out56
    mova      [rsp+gprsize*2+16*10], m3  ;out7
    mova      m0, [rsp+gprsize*2+16*26]  ;tmp[23]
    mova      m1, [rsp+gprsize*2+16*11]  ;tmp[8 ]
    psubsw    m2, m0, m5  ;out40
    paddsw    m0, m5  ;out23
    psubsw    m3, m1, m4  ;out55
    paddsw    m1, m4  ;out8
    mova      [rsp+gprsize*2+16*43], m2  ;out40
    mova      [rsp+gprsize*2+16*26], m0  ;out23
    mova      [rsp+gprsize*2+16*58], m3  ;out55
    mova      [rsp+gprsize*2+16*11], m1  ;out8

    mova      m0, [rsp+gprsize*2+16*37]  ;t34
    mova      m5, [rsp+gprsize*2+16*48]  ;t45
    mova      m3, [rsp+gprsize*2+16*53]  ;t50
    mova      m1, [rsp+gprsize*2+16*64]  ;t61
    mova      m2, [rsp+gprsize*2+16*32]  ;tmp[29]
    psubsw    m4, m0, m5  ;t45a
    paddsw    m0, m5  ;t34a
    psubsw    m5, m1, m3  ;t50a
    paddsw    m1, m3  ;t61a
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896  ;t45, t50
    mova      m3, [rsp+gprsize*2+16*5 ]  ;tmp[2 ]
    psubsw    m6, m2, m0  ;out34
    paddsw    m2, m0  ;out29
    psubsw    m0, m3, m1  ;out61
    paddsw    m3, m1  ;out2
    mova      [rsp+gprsize*2+16*37], m6  ;out34
    mova      [rsp+gprsize*2+16*32], m2  ;out29
    mova      [rsp+gprsize*2+16*64], m0  ;out61
    mova      [rsp+gprsize*2+16*5 ], m3  ;out2
    mova      m0, [rsp+gprsize*2+16*21]  ;tmp[18]
    mova      m1, [rsp+gprsize*2+16*16]  ;tmp[13]
    psubsw    m2, m0, m5  ;out45
    paddsw    m0, m5  ;out18
    psubsw    m3, m1, m4  ;out50
    paddsw    m1, m4  ;out13
    mova      [rsp+gprsize*2+16*48], m2  ;out45
    mova      [rsp+gprsize*2+16*21], m0  ;out18
    mova      [rsp+gprsize*2+16*53], m3  ;out50
    mova      [rsp+gprsize*2+16*16], m1  ;out13

    mova      m0, [rsp+gprsize*2+16*36]  ;t33a
    mova      m5, [rsp+gprsize*2+16*49]  ;t46a
    mova      m3, [rsp+gprsize*2+16*52]  ;t49a
    mova      m1, [rsp+gprsize*2+16*65]  ;t62a
    mova      m2, [rsp+gprsize*2+16*33]  ;tmp[30]
    psubsw    m4, m0, m5  ;t46
    paddsw    m0, m5  ;t33
    psubsw    m5, m1, m3  ;t49
    paddsw    m1, m3  ;t62
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896  ;t46a, t49a
    mova      m3, [rsp+gprsize*2+16*4 ]  ;tmp[1 ]
    psubsw    m6, m2, m0  ;out33
    paddsw    m2, m0  ;out30
    psubsw    m0, m3, m1  ;out62
    paddsw    m3, m1  ;out1
    mova      [rsp+gprsize*2+16*36], m6  ;out33
    mova      [rsp+gprsize*2+16*33], m2  ;out30
    mova      [rsp+gprsize*2+16*65], m0  ;out62
    mova      [rsp+gprsize*2+16*4 ], m3  ;out1
    mova      m0, [rsp+gprsize*2+16*20]  ;tmp[17]
    mova      m1, [rsp+gprsize*2+16*17]  ;tmp[14]
    psubsw    m2, m0, m5  ;out46
    paddsw    m0, m5  ;out17
    psubsw    m3, m1, m4  ;out49
    paddsw    m1, m4  ;out14
    mova      [rsp+gprsize*2+16*49], m2  ;out46
    mova      [rsp+gprsize*2+16*20], m0  ;out17
    mova      [rsp+gprsize*2+16*52], m3  ;out49
    mova      [rsp+gprsize*2+16*17], m1  ;out14

    mova      m0, [rsp+gprsize*2+16*35]  ;t32
    mova      m5, [rsp+gprsize*2+16*50]  ;t47
    mova      m3, [rsp+gprsize*2+16*51]  ;t48
    mova      m1, [rsp+gprsize*2+16*66]  ;t63
    mova      m2, [rsp+gprsize*2+16*34]  ;tmp[31]
    psubsw    m4, m0, m5  ;t47a
    paddsw    m0, m5  ;t32a
    psubsw    m5, m1, m3  ;t48a
    paddsw    m1, m3  ;t63a
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896  ;t47, t48
    mova      m3, [rsp+gprsize*2+16*3 ]  ;tmp[0 ]
    psubsw    m6, m2, m0  ;out32
    paddsw    m2, m0  ;out31
    psubsw    m0, m3, m1  ;out63
    paddsw    m3, m1  ;out0
    mova      [rsp+gprsize*2+16*35], m6  ;out32
    mova      [rsp+gprsize*2+16*34], m2  ;out31
    mova      [rsp+gprsize*2+16*66], m0  ;out63
    mova      [rsp+gprsize*2+16*3 ], m3  ;out0
    mova      m0, [rsp+gprsize*2+16*19]  ;tmp[16]
    mova      m1, [rsp+gprsize*2+16*18]  ;tmp[15]
    psubsw    m2, m0, m5  ;out47
    paddsw    m0, m5  ;out16
    psubsw    m3, m1, m4  ;out48
    paddsw    m1, m4  ;out15
    mova      [rsp+gprsize*2+16*50], m2  ;out47
    mova      [rsp+gprsize*2+16*19], m0  ;out16
    mova      [rsp+gprsize*2+16*51], m3  ;out48
    mova      [rsp+gprsize*2+16*18], m1  ;out15
    ret


; inv_txfm_add_dct_dct_64x16_8bpc(dst, stride, coeff, eob)
; Entry point: eob == 0 takes the DC-only path below, anything else runs
; the full idct_64x16_internal_8bpc.
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly

    call      m(idct_64x16_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_8192)]
    mov       [coeffq], eobd           ; eobd is 0 here: clears the first dword of coeff
    mov       r3d, 16                  ; number of rows handled by .loop
    lea       tx2q, [o(.end)]          ; continuation taken after .loop

; .body: in: m0 = DC term, m1 = pw_2896x8, m2 = first-stage scale,
; r3d = row count, tx2q = continuation
.body:
    pmulhrsw  m0, m2
    movd      m2, [o(pw_2048)]  ;intentionally rip-relative
5766 pmulhrsw m0, m1 5767 pmulhrsw m0, m2 5768 pshuflw m0, m0, q0000 5769 punpcklwd m0, m0 5770 pxor m7, m7 5771 5772.loop: 5773 mova m1, [dstq+16*0] 5774 mova m3, [dstq+16*1] 5775 mova m5, [dstq+16*2] 5776 mova m6, [dstq+16*3] 5777 punpckhbw m2, m1, m7 5778 punpcklbw m1, m7 5779 punpckhbw m4, m3, m7 5780 punpcklbw m3, m7 5781 paddw m2, m0 5782 paddw m1, m0 5783 paddw m4, m0 5784 paddw m3, m0 5785 packuswb m1, m2 5786 packuswb m3, m4 5787 punpckhbw m2, m5, m7 5788 punpcklbw m5, m7 5789 punpckhbw m4, m6, m7 5790 punpcklbw m6, m7 5791 paddw m2, m0 5792 paddw m5, m0 5793 paddw m4, m0 5794 paddw m6, m0 5795 packuswb m5, m2 5796 packuswb m6, m4 5797 mova [dstq+16*0], m1 5798 mova [dstq+16*1], m3 5799 mova [dstq+16*2], m5 5800 mova [dstq+16*3], m6 5801 add dstq, strideq 5802 dec r3d 5803 jg .loop 5804 jmp tx2q 5805 5806.end: 5807 RET 5808 5809 5810%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 5811 5812%if %3 5813 mova m3, [o(pw_2896x8)] 5814 pmulhrsw m0, m3, [%1+%2*0] 5815 pmulhrsw m1, m3, [%1+%2*1] 5816 pmulhrsw m2, m3, [%1+%2*2] 5817 pmulhrsw m3, [%1+%2*3] 5818%else 5819 mova m0, [%1+%2*0] 5820 mova m1, [%1+%2*1] 5821 mova m2, [%1+%2*2] 5822 mova m3, [%1+%2*3] 5823%endif 5824%endmacro 5825 5826%macro LOAD_4ROWS_H 2 ;src, stride 5827 mova m4, [%1+%2*0] 5828 mova m5, [%1+%2*1] 5829 mova m6, [%1+%2*2] 5830 mova m7, [%1+%2*3] 5831%endmacro 5832 5833cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 5834 mov r3d, 2 5835 mov [rsp+gprsize*2+16*67], dstq 5836 lea dstq, [rsp+gprsize+16*68] 5837 5838.pass1_loop: 5839 LOAD_4ROWS coeffq+32*0, 32*8 5840 pxor m4, m4 5841 REPX {mova x, m4}, m5, m6, m7 5842 call m(idct_8x8_internal_8bpc).main 5843 SAVE_7ROWS rsp+gprsize+16*3, 16 5844 5845 pxor m4, m4 5846 LOAD_4ROWS coeffq+32*4, 32*8 5847 5848 REPX {mova x, m4}, m5, m6, m7 5849 call m(idct_16x8_internal_8bpc).main 5850 mova m7, [rsp+gprsize+16*0] 5851 SAVE_8ROWS rsp+gprsize+16*11, 16 5852 5853 LOAD_8ROWS coeffq+32*2, 32*4 5854 mova [rsp+gprsize+16*19], m0 5855 
mova [rsp+gprsize+16*26], m1 5856 mova [rsp+gprsize+16*23], m2 5857 mova [rsp+gprsize+16*22], m3 5858 mova [rsp+gprsize+16*21], m4 5859 mova [rsp+gprsize+16*24], m5 5860 mova [rsp+gprsize+16*25], m6 5861 mova [rsp+gprsize+16*20], m7 5862 5863 call m(idct_8x32_internal_8bpc).main_fast 5864 SAVE_8ROWS rsp+gprsize+16*3, 16 5865 5866 LOAD_8ROWS coeffq+32*1, 32*2 5867 mova [rsp+gprsize+16*35], m0 ;in1 5868 mova [rsp+gprsize+16*49], m1 ;in3 5869 mova [rsp+gprsize+16*43], m2 ;in5 5870 mova [rsp+gprsize+16*41], m3 ;in7 5871 mova [rsp+gprsize+16*39], m4 ;in9 5872 mova [rsp+gprsize+16*45], m5 ;in11 5873 mova [rsp+gprsize+16*47], m6 ;in13 5874 mova [rsp+gprsize+16*37], m7 ;in15 5875 5876 LOAD_8ROWS coeffq+32*17, 32*2 5877 mova [rsp+gprsize+16*63], m0 ;in17 5878 mova [rsp+gprsize+16*53], m1 ;in19 5879 mova [rsp+gprsize+16*55], m2 ;in21 5880 mova [rsp+gprsize+16*61], m3 ;in23 5881 mova [rsp+gprsize+16*59], m4 ;in25 5882 mova [rsp+gprsize+16*57], m5 ;in27 5883 mova [rsp+gprsize+16*51], m6 ;in29 5884 mova [rsp+gprsize+16*65], m7 ;in31 5885 5886 call m(idct_16x64_internal_8bpc).main 5887 5888 LOAD_8ROWS rsp+gprsize+16*3, 16 5889 mova [rsp+gprsize+16*0], m7 5890 mova m7, [o(pw_8192)] 5891 lea tx2q, [o(.pass1_end)] 5892 jmp m(idct_8x8_internal_8bpc).pass1_end1 5893 5894.pass1_end: 5895 SAVE_8ROWS coeffq+32*0, 32 5896 LOAD_8ROWS rsp+gprsize+16*11, 16 5897 mova [rsp+gprsize+16*0], m7 5898 mova m7, [o(pw_8192)] 5899 lea tx2q, [o(.pass1_end1)] 5900 jmp m(idct_8x8_internal_8bpc).pass1_end1 5901 5902.pass1_end1: 5903 SAVE_8ROWS coeffq+32*8, 32 5904 LOAD_8ROWS rsp+gprsize+16*19, 16 5905 mova [rsp+gprsize+16*0], m7 5906 mova m7, [o(pw_8192)] 5907 lea tx2q, [o(.pass1_end2)] 5908 jmp m(idct_8x8_internal_8bpc).pass1_end1 5909 5910.pass1_end2: 5911 SAVE_8ROWS coeffq+32*16, 32 5912 LOAD_8ROWS rsp+gprsize+16*27, 16 5913 mova [rsp+gprsize+16*0], m7 5914 mova m7, [o(pw_8192)] 5915 lea tx2q, [o(.pass1_end3)] 5916 jmp m(idct_8x8_internal_8bpc).pass1_end1 5917 5918.pass1_end3: 5919 SAVE_8ROWS 
coeffq+32*24, 32 5920 LOAD_8ROWS rsp+gprsize+16*35, 16 5921 mova [rsp+gprsize+16*0], m7 5922 mova m7, [o(pw_8192)] 5923 lea tx2q, [o(.pass1_end4)] 5924 jmp m(idct_8x8_internal_8bpc).pass1_end1 5925 5926.pass1_end4: 5927 SAVE_8ROWS dstq+32*0, 32 5928 LOAD_8ROWS rsp+gprsize+16*43, 16 5929 mova [rsp+gprsize+16*0], m7 5930 mova m7, [o(pw_8192)] 5931 lea tx2q, [o(.pass1_end5)] 5932 jmp m(idct_8x8_internal_8bpc).pass1_end1 5933 5934.pass1_end5: 5935 SAVE_8ROWS dstq+32*8, 32 5936 LOAD_8ROWS rsp+gprsize+16*51, 16 5937 mova [rsp+gprsize+16*0], m7 5938 mova m7, [o(pw_8192)] 5939 lea tx2q, [o(.pass1_end6)] 5940 jmp m(idct_8x8_internal_8bpc).pass1_end1 5941 5942.pass1_end6: 5943 SAVE_8ROWS dstq+32*16, 32 5944 LOAD_8ROWS rsp+gprsize+16*59, 16 5945 mova [rsp+gprsize+16*0], m7 5946 mova m7, [o(pw_8192)] 5947 lea tx2q, [o(.pass1_end7)] 5948 jmp m(idct_8x8_internal_8bpc).pass1_end1 5949 5950.pass1_end7: 5951 SAVE_8ROWS dstq+32*24, 32 5952 5953 add coeffq, 16 5954 add dstq, 16 5955 dec r3d 5956 jg .pass1_loop 5957 5958.pass2: 5959 mov dstq, [rsp+gprsize*2+16*67] 5960 sub coeffq, 32 5961 mov r3d, 4 5962 5963.pass2_loop: 5964 mov [rsp+gprsize*1+16*67], r3d 5965 5966 LOAD_4ROWS coeffq+16*0, 32*2 5967 LOAD_4ROWS_H coeffq+16*1, 32*2 5968 call m(idct_8x8_internal_8bpc).main 5969 SAVE_7ROWS rsp+gprsize+16*3, 16 5970 LOAD_4ROWS coeffq+16*2, 32*2 5971 LOAD_4ROWS_H coeffq+16*3, 32*2 5972 call m(idct_16x8_internal_8bpc).main 5973 5974 mov r3, dstq 5975 lea tx2q, [o(.end)] 5976 lea dstq, [dstq+strideq*8] 5977 jmp m(idct_8x8_internal_8bpc).end 5978 5979.end: 5980 LOAD_8ROWS rsp+gprsize+16*3, 16 5981 mova [rsp+gprsize+16*0], m7 5982 lea tx2q, [o(.end1)] 5983 mov dstq, r3 5984 jmp m(idct_8x8_internal_8bpc).end 5985 5986.end1: 5987 pxor m7, m7 5988 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 5989 5990 add coeffq, 16*16 5991 mov r3d, [rsp+gprsize*1+16*67] 5992 mov dstq, [rsp+gprsize*2+16*67] 5993 add dstq, 8 5994 mov [rsp+gprsize*2+16*67], dstq 5995 dec r3d 
5996 jg .pass2_loop 5997 5998 mov r3d, 4 5999 lea coeffq, [rsp+gprsize+16*68] 6000.pass2_loop2: 6001 mov [rsp+gprsize*1+16*67], r3d 6002 6003 LOAD_4ROWS coeffq+16*0, 32*2 6004 LOAD_4ROWS_H coeffq+16*1, 32*2 6005 call m(idct_8x8_internal_8bpc).main 6006 SAVE_7ROWS rsp+gprsize+16*3, 16 6007 LOAD_4ROWS coeffq+16*2, 32*2 6008 LOAD_4ROWS_H coeffq+16*3, 32*2 6009 call m(idct_16x8_internal_8bpc).main 6010 6011 mov r3, dstq 6012 lea tx2q, [o(.end2)] 6013 lea dstq, [dstq+strideq*8] 6014 jmp m(idct_8x8_internal_8bpc).end 6015 6016.end2: 6017 LOAD_8ROWS rsp+gprsize+16*3, 16 6018 mova [rsp+gprsize+16*0], m7 6019 lea tx2q, [o(.end3)] 6020 mov dstq, r3 6021 jmp m(idct_8x8_internal_8bpc).end 6022 6023.end3: 6024 6025 add coeffq, 16*16 6026 mov r3d, [rsp+gprsize*1+16*67] 6027 mov dstq, [rsp+gprsize*2+16*67] 6028 add dstq, 8 6029 mov [rsp+gprsize*2+16*67], dstq 6030 dec r3d 6031 jg .pass2_loop2 6032 ret 6033 6034 6035cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 6036%if ARCH_X86_32 6037 LEA r5, $$ 6038%endif 6039 test eobd, eobd 6040 jz .dconly 6041 call m(idct_32x64_internal_8bpc) 6042.end: 6043 RET 6044 6045.dconly: 6046 movd m1, [o(pw_2896x8)] 6047 pmulhrsw m0, m1, [coeffq] 6048 movd m2, [o(pw_16384)] 6049 mov [coeffq], eobd 6050 pmulhrsw m0, m1 6051 mov r3d, 64 6052 lea tx2q, [o(.end)] 6053 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body 6054 6055 6056cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 6057 mov r4d, 2 6058 sub eobd, 136 6059 mov [rsp+gprsize*1+16*67], eobd 6060 mov r3d, 4 6061 cmovs r3d, r4d 6062 6063%if ARCH_X86_32 6064 LEA r5, $$ 6065%endif 6066 6067 mov [rsp+gprsize*2+16*67], coeffq 6068 6069.pass1_loop: 6070 LOAD_8ROWS coeffq+64*1, 64*2, 1 6071 mova [rsp+gprsize+16*19], m0 ;in1 6072 mova [rsp+gprsize+16*26], m1 ;in3 6073 mova [rsp+gprsize+16*23], m2 ;in5 6074 mova [rsp+gprsize+16*22], m3 ;in7 6075 mova [rsp+gprsize+16*21], m4 ;in9 6076 mova [rsp+gprsize+16*24], m5 ;in11 6077 mova [rsp+gprsize+16*25], m6 
;in13 6078 mova [rsp+gprsize+16*20], m7 ;in15 6079 6080 mov tx2d, [rsp+gprsize*1+16*67] 6081 test tx2d, tx2d 6082 jl .fast 6083 6084.full: 6085 LOAD_8ROWS coeffq+64*0, 64*4, 1 6086 call m(idct_8x8_internal_8bpc).main 6087 SAVE_7ROWS rsp+gprsize+16*3, 16 6088 LOAD_8ROWS coeffq+64*2, 64*4, 1 6089 call m(idct_16x8_internal_8bpc).main 6090 mova m7, [rsp+gprsize+16*0] 6091 SAVE_8ROWS rsp+gprsize+16*11, 16 6092 6093 LOAD_8ROWS coeffq+64*17, 64*2, 1 6094 mova [rsp+gprsize+16*33], m0 ;in17 6095 mova [rsp+gprsize+16*28], m1 ;in19 6096 mova [rsp+gprsize+16*29], m2 ;in21 6097 mova [rsp+gprsize+16*32], m3 ;in23 6098 mova [rsp+gprsize+16*31], m4 ;in25 6099 mova [rsp+gprsize+16*30], m5 ;in27 6100 mova [rsp+gprsize+16*27], m6 ;in29 6101 mova [rsp+gprsize+16*34], m7 ;in31 6102 6103 call m(idct_8x32_internal_8bpc).main 6104 jmp .pass1_end 6105 6106.fast: 6107 LOAD_4ROWS coeffq, 256, 1 6108 pxor m4, m4 6109 REPX {mova x, m4}, m5, m6, m7 6110 call m(idct_8x8_internal_8bpc).main 6111 6112 SAVE_7ROWS rsp+gprsize+16*3, 16 6113 LOAD_4ROWS coeffq+128*1, 256, 1 6114 pxor m4, m4 6115 REPX {mova x, m4}, m5, m6, m7 6116 call m(idct_16x8_internal_8bpc).main 6117 mova m7, [rsp+gprsize+16*0] 6118 SAVE_8ROWS rsp+gprsize+16*11, 16 6119 6120 call m(idct_8x32_internal_8bpc).main_fast 6121 6122.pass1_end: 6123 mova [rsp+gprsize+16*0], m7 6124 lea tx2q, [o(.pass1_end1)] 6125 jmp m(idct_8x8_internal_8bpc).pass1_end 6126 6127.pass1_end1: 6128 SAVE_8ROWS coeffq+64*0, 64 6129 LOAD_8ROWS rsp+gprsize+16*11, 16 6130 mova [rsp+gprsize+16*0], m7 6131 lea tx2q, [o(.pass1_end2)] 6132 jmp m(idct_8x8_internal_8bpc).pass1_end 6133 6134.pass1_end2: 6135 SAVE_8ROWS coeffq+64*8, 64 6136 LOAD_8ROWS rsp+gprsize+16*19, 16 6137 mova [rsp+gprsize+16*0], m7 6138 lea tx2q, [o(.pass1_end3)] 6139 jmp m(idct_8x8_internal_8bpc).pass1_end 6140 6141.pass1_end3: 6142 SAVE_8ROWS coeffq+64*16, 64 6143 LOAD_8ROWS rsp+gprsize+16*27, 16 6144 mova [rsp+gprsize+16*0], m7 6145 lea tx2q, [o(.pass1_end4)] 6146 jmp 
m(idct_8x8_internal_8bpc).pass1_end 6147 6148.pass1_end4: 6149 SAVE_8ROWS coeffq+64*24, 64 6150 6151 add coeffq, 16 6152 dec r3d 6153 jg .pass1_loop 6154 6155.pass2: 6156 mov coeffq, [rsp+gprsize*2+16*67] 6157 mov r3d, 4 6158 lea r4, [dstq+8] 6159 mov [rsp+gprsize*2+16*67], r4 6160 lea r4, [o(m(idct_16x64_internal_8bpc).end1)] 6161 jmp m(idct_16x64_internal_8bpc).pass2_loop 6162 6163 6164cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 6165%if ARCH_X86_32 6166 LEA r5, $$ 6167%endif 6168 test eobd, eobd 6169 jz .dconly 6170 call m(idct_64x32_internal_8bpc) 6171.end: 6172 RET 6173 6174.dconly: 6175 movd m1, [o(pw_2896x8)] 6176 pmulhrsw m0, m1, [coeffq] 6177 movd m2, [o(pw_16384)] 6178 pmulhrsw m0, m1 6179 mov [coeffq], eobd 6180 mov r3d, 32 6181 lea tx2q, [o(.end)] 6182 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body 6183 6184 6185cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 6186 mov r4d, 2 6187 sub eobd, 136 6188 mov [rsp+gprsize*1+16*67], eobd 6189 mov r3d, 4 6190 cmovs r3d, r4d 6191 6192%if ARCH_X86_32 6193 LEA r5, $$ 6194%endif 6195 6196 mov [rsp+gprsize*2+16*67], coeffq 6197 mov [rsp+gprsize*3+16*67], dstq 6198 lea dstq, [rsp+gprsize+16*69] 6199 mov [rsp+gprsize*4+16*67], dstq 6200 6201.pass1_loop: 6202 LOAD_4ROWS coeffq+64*0, 64*8, 1 6203 pxor m4, m4 6204 REPX {mova x, m4}, m5, m6, m7 6205 call m(idct_8x8_internal_8bpc).main 6206 SAVE_7ROWS rsp+gprsize+16*3, 16 6207 6208 pxor m4, m4 6209 LOAD_4ROWS coeffq+64*4, 64*8, 1 6210 6211 REPX {mova x, m4}, m5, m6, m7 6212 call m(idct_16x8_internal_8bpc).main 6213 mova m7, [rsp+gprsize+16*0] 6214 SAVE_8ROWS rsp+gprsize+16*11, 16 6215 6216 LOAD_8ROWS coeffq+64*2, 64*4, 1 6217 mova [rsp+gprsize+16*19], m0 6218 mova [rsp+gprsize+16*26], m1 6219 mova [rsp+gprsize+16*23], m2 6220 mova [rsp+gprsize+16*22], m3 6221 mova [rsp+gprsize+16*21], m4 6222 mova [rsp+gprsize+16*24], m5 6223 mova [rsp+gprsize+16*25], m6 6224 mova [rsp+gprsize+16*20], m7 6225 6226 call 
m(idct_8x32_internal_8bpc).main_fast 6227 SAVE_8ROWS rsp+gprsize+16*3, 16 6228 6229 LOAD_8ROWS coeffq+64*1, 64*2, 1 6230 mova [rsp+gprsize+16*35], m0 ;in1 6231 mova [rsp+gprsize+16*49], m1 ;in3 6232 mova [rsp+gprsize+16*43], m2 ;in5 6233 mova [rsp+gprsize+16*41], m3 ;in7 6234 mova [rsp+gprsize+16*39], m4 ;in9 6235 mova [rsp+gprsize+16*45], m5 ;in11 6236 mova [rsp+gprsize+16*47], m6 ;in13 6237 mova [rsp+gprsize+16*37], m7 ;in15 6238 6239 LOAD_8ROWS coeffq+64*17, 64*2, 1 6240 mova [rsp+gprsize+16*63], m0 ;in17 6241 mova [rsp+gprsize+16*53], m1 ;in19 6242 mova [rsp+gprsize+16*55], m2 ;in21 6243 mova [rsp+gprsize+16*61], m3 ;in23 6244 mova [rsp+gprsize+16*59], m4 ;in25 6245 mova [rsp+gprsize+16*57], m5 ;in27 6246 mova [rsp+gprsize+16*51], m6 ;in29 6247 mova [rsp+gprsize+16*65], m7 ;in31 6248 6249 call m(idct_16x64_internal_8bpc).main 6250 6251 LOAD_8ROWS rsp+gprsize+16*3, 16 6252 mova [rsp+gprsize+16*0], m7 6253 lea tx2q, [o(.pass1_end)] 6254 jmp m(idct_8x8_internal_8bpc).pass1_end 6255 6256.pass1_end: 6257 SAVE_8ROWS coeffq+64*0, 64 6258 LOAD_8ROWS rsp+gprsize+16*11, 16 6259 mova [rsp+gprsize+16*0], m7 6260 lea tx2q, [o(.pass1_end1)] 6261 jmp m(idct_8x8_internal_8bpc).pass1_end 6262 6263.pass1_end1: 6264 SAVE_8ROWS coeffq+64*8, 64 6265 LOAD_8ROWS rsp+gprsize+16*19, 16 6266 mova [rsp+gprsize+16*0], m7 6267 lea tx2q, [o(.pass1_end2)] 6268 jmp m(idct_8x8_internal_8bpc).pass1_end 6269 6270.pass1_end2: 6271 SAVE_8ROWS coeffq+64*16, 64 6272 LOAD_8ROWS rsp+gprsize+16*27, 16 6273 mova [rsp+gprsize+16*0], m7 6274 lea tx2q, [o(.pass1_end3)] 6275 jmp m(idct_8x8_internal_8bpc).pass1_end 6276 6277.pass1_end3: 6278 SAVE_8ROWS coeffq+64*24, 64 6279 LOAD_8ROWS rsp+gprsize+16*35, 16 6280 mova [rsp+gprsize+16*0], m7 6281 lea tx2q, [o(.pass1_end4)] 6282 jmp m(idct_8x8_internal_8bpc).pass1_end 6283 6284.pass1_end4: 6285 SAVE_8ROWS dstq+64*0, 64 6286 LOAD_8ROWS rsp+gprsize+16*43, 16 6287 mova [rsp+gprsize+16*0], m7 6288 lea tx2q, [o(.pass1_end5)] 6289 jmp 
m(idct_8x8_internal_8bpc).pass1_end 6290 6291.pass1_end5: 6292 SAVE_8ROWS dstq+64*8, 64 6293 LOAD_8ROWS rsp+gprsize+16*51, 16 6294 mova [rsp+gprsize+16*0], m7 6295 lea tx2q, [o(.pass1_end6)] 6296 jmp m(idct_8x8_internal_8bpc).pass1_end 6297 6298.pass1_end6: 6299 SAVE_8ROWS dstq+64*16, 64 6300 LOAD_8ROWS rsp+gprsize+16*59, 16 6301 mova [rsp+gprsize+16*0], m7 6302 lea tx2q, [o(.pass1_end7)] 6303 jmp m(idct_8x8_internal_8bpc).pass1_end 6304 6305.pass1_end7: 6306 SAVE_8ROWS dstq+64*24, 64 6307 6308 add coeffq, 16 6309 add dstq, 16 6310 dec r3d 6311 jg .pass1_loop 6312 6313.pass2: 6314 mov coeffq, [rsp+gprsize*4+16*67] 6315 mov dstq, [rsp+gprsize*3+16*67] 6316 mov eobd, [rsp+gprsize*1+16*67] 6317 lea dstq, [dstq+32] 6318 mov [rsp+gprsize*1+16*35], eobd 6319 lea tx2q, [o(.pass2_end)] 6320 mov r3d, 4 6321 jmp m(idct_32x32_internal_8bpc).pass2_loop 6322 6323.pass2_end: 6324 mova [rsp+gprsize+16*0], m7 6325 lea r3, [o(.pass2_end1)] 6326 jmp m(idct_8x32_internal_8bpc).end2 6327 6328.pass2_end1: 6329 lea tx2q, [o(.pass2_end)] 6330 add coeffq, 16*32 6331 mov dstq, [rsp+gprsize*2+16*35] 6332 mov r3d, [rsp+gprsize*3+16*35] 6333 dec r3d 6334 jg m(idct_32x32_internal_8bpc).pass2_loop 6335 6336.pass2_end2: 6337 mov dstq, [rsp+gprsize*3+16*67] 6338 mov coeffq, [rsp+gprsize*2+16*67] 6339 lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] 6340 mov r3d, 4 6341 jmp m(idct_32x32_internal_8bpc).pass2_loop 6342 6343 6344cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 6345%if ARCH_X86_32 6346 LEA r5, $$ 6347%endif 6348 test eobd, eobd 6349 jz .dconly 6350 6351 call m(idct_64x64_internal_8bpc) 6352 RET 6353 6354.dconly: 6355 movd m1, [o(pw_2896x8)] 6356 pmulhrsw m0, m1, [coeffq] 6357 movd m2, [o(pw_8192)] 6358 mov [coeffq], eobd 6359 mov r3d, 64 6360 lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] 6361 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body 6362 6363cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 6364 mov r5d, 4 6365 
mov r4d, 2 6366 sub eobd, 136 6367 cmovns r4d, r5d 6368 6369%if ARCH_X86_32 6370 LEA r5, $$ 6371%endif 6372 6373 mov [rsp+gprsize*1+16*67], eobd 6374 mov r3d, r4d 6375 mov [rsp+gprsize*4+16*67], coeffq 6376 mov [rsp+gprsize*3+16*67], dstq 6377 lea dstq, [rsp+gprsize+16*69] 6378 mov [rsp+gprsize*2+16*67], dstq 6379 6380.pass1_loop: 6381 LOAD_4ROWS coeffq+64*0, 64*8 6382 pxor m4, m4 6383 REPX {mova x, m4}, m5, m6, m7 6384 call m(idct_8x8_internal_8bpc).main 6385 SAVE_7ROWS rsp+gprsize+16*3, 16 6386 6387 pxor m4, m4 6388 LOAD_4ROWS coeffq+64*4, 64*8 6389 6390 REPX {mova x, m4}, m5, m6, m7 6391 call m(idct_16x8_internal_8bpc).main 6392 mova m7, [rsp+gprsize+16*0] 6393 SAVE_8ROWS rsp+gprsize+16*11, 16 6394 6395 LOAD_8ROWS coeffq+64*2, 64*4 6396 mova [rsp+gprsize+16*19], m0 6397 mova [rsp+gprsize+16*26], m1 6398 mova [rsp+gprsize+16*23], m2 6399 mova [rsp+gprsize+16*22], m3 6400 mova [rsp+gprsize+16*21], m4 6401 mova [rsp+gprsize+16*24], m5 6402 mova [rsp+gprsize+16*25], m6 6403 mova [rsp+gprsize+16*20], m7 6404 6405 call m(idct_8x32_internal_8bpc).main_fast 6406 SAVE_8ROWS rsp+gprsize+16*3, 16 6407 6408 LOAD_8ROWS coeffq+64*1, 64*2 6409 mova [rsp+gprsize+16*35], m0 ;in1 6410 mova [rsp+gprsize+16*49], m1 ;in3 6411 mova [rsp+gprsize+16*43], m2 ;in5 6412 mova [rsp+gprsize+16*41], m3 ;in7 6413 mova [rsp+gprsize+16*39], m4 ;in9 6414 mova [rsp+gprsize+16*45], m5 ;in11 6415 mova [rsp+gprsize+16*47], m6 ;in13 6416 mova [rsp+gprsize+16*37], m7 ;in15 6417 6418 LOAD_8ROWS coeffq+64*17, 64*2 6419 mova [rsp+gprsize+16*63], m0 ;in17 6420 mova [rsp+gprsize+16*53], m1 ;in19 6421 mova [rsp+gprsize+16*55], m2 ;in21 6422 mova [rsp+gprsize+16*61], m3 ;in23 6423 mova [rsp+gprsize+16*59], m4 ;in25 6424 mova [rsp+gprsize+16*57], m5 ;in27 6425 mova [rsp+gprsize+16*51], m6 ;in29 6426 mova [rsp+gprsize+16*65], m7 ;in31 6427 6428 call m(idct_16x64_internal_8bpc).main 6429 6430 LOAD_8ROWS rsp+gprsize+16*3, 16 6431 mova [rsp+gprsize+16*0], m7 6432 mova m7, [o(pw_8192)] 6433 lea tx2q, 
[o(.pass1_end)] 6434 jmp m(idct_8x8_internal_8bpc).pass1_end1 6435 6436.pass1_end: 6437 SAVE_8ROWS coeffq+64*0, 64 6438 LOAD_8ROWS rsp+gprsize+16*11, 16 6439 mova [rsp+gprsize+16*0], m7 6440 mova m7, [o(pw_8192)] 6441 lea tx2q, [o(.pass1_end1)] 6442 jmp m(idct_8x8_internal_8bpc).pass1_end1 6443 6444.pass1_end1: 6445 SAVE_8ROWS coeffq+64*8, 64 6446 LOAD_8ROWS rsp+gprsize+16*19, 16 6447 mova [rsp+gprsize+16*0], m7 6448 mova m7, [o(pw_8192)] 6449 lea tx2q, [o(.pass1_end2)] 6450 jmp m(idct_8x8_internal_8bpc).pass1_end1 6451 6452.pass1_end2: 6453 SAVE_8ROWS coeffq+64*16, 64 6454 LOAD_8ROWS rsp+gprsize+16*27, 16 6455 mova [rsp+gprsize+16*0], m7 6456 mova m7, [o(pw_8192)] 6457 lea tx2q, [o(.pass1_end3)] 6458 jmp m(idct_8x8_internal_8bpc).pass1_end1 6459 6460.pass1_end3: 6461 SAVE_8ROWS coeffq+64*24, 64 6462 LOAD_8ROWS rsp+gprsize+16*35, 16 6463 mova [rsp+gprsize+16*0], m7 6464 mova m7, [o(pw_8192)] 6465 lea tx2q, [o(.pass1_end4)] 6466 jmp m(idct_8x8_internal_8bpc).pass1_end1 6467 6468.pass1_end4: 6469 SAVE_8ROWS dstq+64*0, 64 6470 LOAD_8ROWS rsp+gprsize+16*43, 16 6471 mova [rsp+gprsize+16*0], m7 6472 mova m7, [o(pw_8192)] 6473 lea tx2q, [o(.pass1_end5)] 6474 jmp m(idct_8x8_internal_8bpc).pass1_end1 6475 6476.pass1_end5: 6477 SAVE_8ROWS dstq+64*8, 64 6478 LOAD_8ROWS rsp+gprsize+16*51, 16 6479 mova [rsp+gprsize+16*0], m7 6480 mova m7, [o(pw_8192)] 6481 lea tx2q, [o(.pass1_end6)] 6482 jmp m(idct_8x8_internal_8bpc).pass1_end1 6483 6484.pass1_end6: 6485 SAVE_8ROWS dstq+64*16, 64 6486 LOAD_8ROWS rsp+gprsize+16*59, 16 6487 mova [rsp+gprsize+16*0], m7 6488 mova m7, [o(pw_8192)] 6489 lea tx2q, [o(.pass1_end7)] 6490 jmp m(idct_8x8_internal_8bpc).pass1_end1 6491 6492.pass1_end7: 6493 SAVE_8ROWS dstq+64*24, 64 6494 6495 add coeffq, 16 6496 add dstq, 16 6497 dec r3d 6498 jg .pass1_loop 6499 6500.pass2: 6501 mov dstq, [rsp+gprsize*3+16*67] 6502 mov coeffq, [rsp+gprsize*2+16*67] 6503 lea dstq, [dstq+32] 6504 mov r3d, 4 6505 lea r4, [dstq+8] 6506 mov [rsp+gprsize*2+16*67], r4 6507 lea 
r4, [o(.pass2_end)] 6508 jmp m(idct_16x64_internal_8bpc).pass2_loop 6509 6510.pass2_end: 6511 LOAD_8ROWS rsp+gprsize+16*35, 16 6512 lea dstq, [dstq+strideq*2] 6513 lea r3, [rsp+16*32+gprsize] 6514 mova [rsp+gprsize+16*0], m7 6515 call m(idct_16x64_internal_8bpc).write 6516 mov dstq, [rsp+gprsize*2+16*67] 6517 mov r3d, [rsp+gprsize*3+16*67] 6518 lea r4, [dstq+8] 6519 mov [rsp+gprsize*2+16*67], r4 6520 lea r4, [o(.pass2_end)] 6521 6522 dec r3d 6523 jg m(idct_16x64_internal_8bpc).pass2_loop 6524 6525.pass2_end2: 6526 mov coeffq, [rsp+gprsize*4+16*67] 6527 mov dstq, [rsp+gprsize*2+16*67] 6528 mov r3d, 4 6529 sub dstq, 72 6530 lea r4, [dstq+8] 6531 mov [rsp+gprsize*2+16*67], r4 6532 lea r4, [o(m(idct_16x64_internal_8bpc).end1)] 6533 jmp m(idct_16x64_internal_8bpc).pass2_loop 6534