; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"


SECTION_RODATA 16

; Byte-index tables for pshufb; each entry pair (2k, 2k+1) moves one 16-bit word.
deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15

deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7

; COEF_PAIR a, b [, flag]
; Emits 16-byte tables of interleaved word pairs (each pair repeated 4 times),
; laid out for use as a pmaddwd operand:
;   pw_<a>_m<b>  = ( a, -b)   always
;   pw_<b>_<a>   = ( b,  a)   unless flag == 2
;   pw_m<a>_m<b> = (-a, -b)   when flag != 0
%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
pw_%1_m%2:  times 4 dw  %1, -%2
%if %3 != 2
pw_%2_%1:   times 4 dw  %2,  %1
%endif
%if %3
pw_m%1_m%2: times 4 dw -%1, -%2
%endif
%endmacro

;adst4
pw_1321_3803:   times 4 dw  1321,  3803
pw_2482_m1321:  times 4 dw  2482, -1321
pw_3344_2482:   times 4 dw  3344,  2482
pw_3344_m3803:  times 4 dw  3344, -3803
pw_3344_m3344:  times 4 dw  3344, -3344
pw_0_3344:      times 4 dw     0,  3344
pw_m6688_m3803: times 4 dw -6688, -3803

COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR  799, 4017
COEF_PAIR 3406, 2276
COEF_PAIR  401, 4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR 3784, 1567, 1
COEF_PAIR  995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4017,  799, 1
COEF_PAIR  201, 4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052,  601
COEF_PAIR 2276, 3406, 1
COEF_PAIR 4076,  401, 2
COEF_PAIR 2598, 3166, 2
COEF_PAIR 3612, 1931, 2
COEF_PAIR 1189, 3920, 2

; Broadcast constants. pd_* hold 4 dwords, pw_* hold 8 words.
; The *x8 / *x16 entries store the coefficient pre-multiplied by 8 / 16;
; the ones referenced in this part of the file are used as pmulhrsw operands
; (presumably the same holds for the remainder - confirm against their users).
pd_2048:        times 4 dd  2048
pw_2048:        times 8 dw  2048
pw_m2048:       times 8 dw -2048
pw_4096:        times 8 dw  4096
pw_16384:       times 8 dw  16384
pw_m16384:      times 8 dw  -16384
pw_1697x16:     times 8 dw  1697*16
pw_1697x8:      times 8 dw  1697*8
pw_2896x8:      times 8 dw  2896*8
pw_3344x8:      times 8 dw  3344*8
pw_8192:        times 8 dw  8192
pw_m8192:       times 8 dw -8192
pw_5:           times 8 dw  5
pw_201x8:       times 8 dw   201*8
pw_4091x8:      times 8 dw  4091*8
pw_m2751x8:     times 8 dw -2751*8
pw_3035x8:      times 8 dw  3035*8
pw_1751x8:      times 8 dw  1751*8
pw_3703x8:      times 8 dw  3703*8
pw_m1380x8:     times 8 dw -1380*8
pw_3857x8:      times 8 dw  3857*8
pw_995x8:       times 8 dw   995*8
pw_3973x8:      times 8 dw  3973*8
pw_m2106x8:     times 8 dw -2106*8
pw_3513x8:      times 8 dw  3513*8
pw_2440x8:      times 8 dw  2440*8
pw_3290x8:      times 8 dw  3290*8
pw_m601x8:      times 8 dw  -601*8
pw_4052x8:      times 8 dw  4052*8

pw_4095x8:      times 8 dw  4095*8
pw_101x8:       times 8 dw   101*8
pw_2967x8:      times 8 dw  2967*8
pw_m2824x8:     times 8 dw -2824*8
pw_3745x8:      times 8 dw  3745*8
pw_1660x8:      times 8 dw  1660*8
pw_3822x8:      times 8 dw  3822*8
pw_m1474x8:     times 8 dw -1474*8
pw_3996x8:      times 8 dw  3996*8
pw_897x8:       times 8 dw   897*8
pw_3461x8:      times 8 dw  3461*8
pw_m2191x8:     times 8 dw -2191*8
pw_3349x8:      times 8 dw  3349*8
pw_2359x8:      times 8 dw  2359*8
pw_4036x8:      times 8 dw  4036*8
pw_m700x8:      times 8 dw  -700*8
pw_4065x8:      times 8 dw  4065*8
pw_501x8:       times 8 dw   501*8
pw_3229x8:      times 8 dw  3229*8
pw_m2520x8:     times 8 dw -2520*8
pw_3564x8:      times 8 dw  3564*8
pw_2019x8:      times 8 dw  2019*8
pw_3948x8:      times 8 dw  3948*8
pw_m1092x8:     times 8 dw -1092*8
pw_3889x8:      times 8 dw  3889*8
pw_1285x8:      times 8 dw  1285*8
pw_3659x8:      times 8 dw  3659*8
pw_m1842x8:     times 8 dw -1842*8
pw_3102x8:      times 8 dw  3102*8
pw_2675x8:      times 8 dw  2675*8
pw_4085x8:      times 8 dw  4085*8
pw_m301x8:      times 8 dw  -301*8

SECTION .text

; REPX {template}, a, b, ...
; Expands the brace-enclosed template once per remaining argument, in order,
; with the argument substituted for the token x.
%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

; m(name): fully mangled symbol for a function defined in this file.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; o(sym): address expression for a constant. On x86-32 the address is formed
; relative to r5, which INV_TXFM_FN loads with $$ (start of section).
%if ARCH_X86_64
%define o(x) x
%else
%define o(x) r5-$$+x ; PIC
%endif

; WRITE_4X4 src1, src2, tmp1, tmp2, tmp3, row1, row2, row3, row4
; Adds 16 residual words (src1 = groups 1-2, src2 = groups 3-4, four words
; per group) to four 4-pixel rows of 8-bit pixels and stores the result with
; unsigned saturation. rowN (0..3) is the destination row of group N; the
; address is dstq + rowN*strideq. Clobbers r2 and the three tmp registers.
%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4]
    lea         r2, [dstq+strideq*2]
%assign %%i 1
%rotate 5
%rep 4
    %if %1 & 2
        CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
    %else
        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    %endif
    %assign %%i %%i + 1
    %rotate 1
%endrep

    movd        m%3, [%%row_adr1]       ;dst0
    movd        m%5, [%%row_adr2]       ;dst1
    punpckldq   m%3, m%5                ;high: dst1 :low: dst0
    movd        m%4, [%%row_adr3]       ;dst2
    movd        m%5, [%%row_adr4]       ;dst3
    punpckldq   m%4, m%5                ;high: dst3 :low: dst2

    pxor        m%5, m%5
    punpcklbw   m%3, m%5                ;extend byte to word
    punpcklbw   m%4, m%5                ;extend byte to word

    paddw       m%3, m%1                ;high: dst1 + out1 ;low: dst0 + out0
    paddw       m%4, m%2                ;high: dst3 + out3 ;low: dst2 + out2

    packuswb    m%3, m%4                ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0

    movd        [%%row_adr1], m%3       ;store dst0 + out0
    pshuflw     m%4, m%3, q1032
    movd        [%%row_adr2], m%4       ;store dst1 + out1
    punpckhqdq  m%3, m%3
    movd        [%%row_adr3], m%3       ;store dst2 + out2
    psrlq       m%3, 32
    movd        [%%row_adr4], m%3       ;store dst3 + out3
%endmacro

; ITX4_END row1, row2, row3, row4 [, rnd = 2048]
; Common 4x4 tail: unless rnd is 0, scales m0/m1 with pmulhrsw by pw_<rnd>,
; then adds them to the destination via WRITE_4X4 and returns.
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
    mova        m2, [o(pw_%5)]
    pmulhrsw    m0, m2
    pmulhrsw    m1, m2
%endif

    WRITE_4X4   0, 1, 2, 3, 4, %1, %2, %3, %4
    ret
%endmacro

; ITX_MUL2X_PACK: rotation on a register of interleaved word pairs (a, b).
; Default result after packing:
;   low half  = (a*coef2 + b*coef1 + rnd) >> 12
;   high half = (a*coef1 - b*coef2 + rnd) >> 12
; flag 1 swaps the two halves, flag 2 takes the coefficient tables from
; registers instead of memory, flag 4 leaves the two dword results unpacked
; (second expression in dst, first in tmp for the default order).
; flags: 1 = swap, 2: coef_regs, 4: no_pack
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
    pmaddwd     m%2, m%4, m%1
    pmaddwd     m%1, m%5
%elif %6 & 1
    pmaddwd     m%2, m%1, [o(pw_%5_%4)]
    pmaddwd     m%1, [o(pw_%4_m%5)]
%else
    pmaddwd     m%2, m%1, [o(pw_%4_m%5)]
    pmaddwd     m%1, [o(pw_%5_%4)]
%endif
    paddd       m%2, m%3
    paddd       m%1, m%3
    psrad       m%2, 12
    psrad       m%1, 12
%if %6 & 4 == 0
    packssdw    m%1, m%2
%endif
%endmacro

; 4-point inverse DCT on two packed registers (m0 = in0|in1, m1 = in2|in3).
; Result: m0 = out0|out1, m1 = out3|out2. Clobbers m2 and m3.
%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
    mova        m3, [o(pd_2048)]
    punpckhwd   m2, m0, m1              ;unpacked in1 in3
    punpcklwd   m0, m1                  ;unpacked in0 in2
    ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
    ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
    psubsw      m1, m0, m2              ;high: out2 ;low: out3
    paddsw      m0, m2                  ;high: out1 ;low: out0
%endmacro

; INV_TXFM_FN type1, type2, size, xmm/stack
; Declares the public entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc.
; It points tx2q at the .pass2 label of the type2 internal function and then
; transfers control to the type1 internal function, which ends its first
; pass with "jmp tx2q". For dct_dct an eob of zero skips both and continues
; with whatever code follows the macro invocation (the DC-only path).
; When has_epilogue is false and the pair is not dct_dct, the jmp is emitted
; only if the type1 function is located after the aligned label that follows
; this stub; otherwise execution falls through.
%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
    %define %%p1 m(i%1_%3_internal_8bpc)
%if ARCH_X86_32
    LEA         r5, $$
%endif
%if has_epilogue
%ifidn %1_%2, dct_dct
    test        eobd, eobd
    jz %%end
%endif
    lea         tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
    call %%p1
    RET
%%end:
%else
    lea         tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
%ifidn %1_%2, dct_dct
    test        eobd, eobd
    jnz %%p1
%else
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endif
%endmacro

; 4x4 entry point. The dct_dct variant appends the DC-only path: broadcast
; the first coefficient, scale it twice by pw_2896x8 and join the shared
; store tail. eobd is zero on this path, so the store below clears the
; first 32 bits of the coefficient buffer.
%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x4, 6
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000
    punpcklqdq  m0, m0
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    mov         [coeffq], eobd          ;0
    pmulhrsw    m0, m1
    mova        m1, m0
    TAIL_CALL m(iadst_4x4_internal_8bpc).end2
%endif
%endmacro

INIT_XMM ssse3
; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.

INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
INV_TXFM_4X4_FN dct, identity

; 4x4 DCT. The entry code runs the first pass and re-packs the result for
; the second pass selected through tx2q; .pass2 runs the second pass, clears
; the coefficient buffer and adds the result to the destination.
cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]       ;high: in1 ;low: in0
    mova        m1, [coeffq+16*1]       ;high: in3 ;low in2

    IDCT4_1D_PACKED

    mova        m2, [o(deint_shuf)]
    shufps      m3, m0, m1, q1331
    shufps      m0, m1, q0220
    pshufb      m0, m2                  ;high: in1 ;low: in0
    pshufb      m1, m3, m2              ;high: in3 ;low :in2
    jmp         tx2q

.pass2:
    IDCT4_1D_PACKED

    pxor        m2, m2
    ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
    REPX {mova [coeffq+16*x], m2}, 0, 1

    ITX4_END    0, 1, 3, 2

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; 4x4 ADST. .end and .end2 are shared tails that other 4x4 paths in this
; file jump to (.end clears the coefficients first, .end2 only stores).
cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    call .main
    punpckhwd   m2, m0, m1
    punpcklwd   m0, m1
    punpckhwd   m1, m0, m2              ;high: in3 ;low :in2
    punpcklwd   m0, m2                  ;high: in1 ;low: in0
    jmp         tx2q

.pass2:
    call .main

.end:
    pxor        m2, m2
    REPX {mova [coeffq+16*x], m2}, 0, 1

.end2:
    ITX4_END    0, 1, 2, 3

; 4-point inverse ADST. In: m0 = in0|in1, m1 = in2|in3.
; Out: m0 = out0|out1, m1 = out2|out3. Clobbers m2-m5.
ALIGN function_align
cglobal_label .main
    punpcklwd   m2, m0, m1                  ;unpacked in0 in2
    punpckhwd   m0, m1                      ;unpacked in1 in3
    mova        m3, m0
    pmaddwd     m1, m2, [o(pw_3344_m3344)]  ;3344 * in0 - 3344 * in2
    pmaddwd     m0, [o(pw_0_3344)]          ;3344 * in3
    paddd       m1, m0                      ;t2
    pmaddwd     m0, m2, [o(pw_1321_3803)]   ;1321 * in0 + 3803 * in2
    pmaddwd     m2, [o(pw_2482_m1321)]      ;2482 * in0 - 1321 * in2
    pmaddwd     m4, m3, [o(pw_3344_2482)]   ;3344 * in1 + 2482 * in3
    pmaddwd     m5, m3, [o(pw_3344_m3803)]  ;3344 * in1 - 3803 * in3
    paddd       m4, m0                      ;t0 + t3
    pmaddwd     m3, [o(pw_m6688_m3803)]     ;-2 * 3344 * in1 - 3803 * in3
    mova        m0, [o(pd_2048)]
    paddd       m1, m0                      ;t2 + 2048
    paddd       m2, m0
    paddd       m0, m4                      ;t0 + t3 + 2048
    paddd       m5, m2                      ;t1 + t3 + 2048
    paddd       m2, m4
    paddd       m2, m3                      ;t0 + t1 - t3 + 2048
    REPX {psrad x, 12}, m1, m0, m5, m2
    packssdw    m0, m5                      ;high: out1 ;low: out0
    packssdw    m1, m2                      ;high: out3 ;low: out2
    ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

; 4x4 flipped ADST: reuses the ADST kernel and reverses the output order,
; through the unpack order in pass 1 and the row order 3, 2, 1, 0 in pass 2.
cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    call m(iadst_4x4_internal_8bpc).main
    punpcklwd   m2, m1, m0
    punpckhwd   m1, m0
    punpcklwd   m0, m1, m2              ;high: in3 ;low :in2
    punpckhwd   m1, m2                  ;high: in1 ;low: in0
    jmp         tx2q

.pass2:
    call m(iadst_4x4_internal_8bpc).main

.end:
    pxor        m2, m2
    REPX {mova [coeffq+16*x], m2}, 0, 1

.end2:
    ITX4_END    3, 2, 1, 0

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

; 4x4 identity: each pass computes x + pmulhrsw(x, 1697*8) with saturation.
cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    mova        m3, [o(pw_1697x8)]
    pmulhrsw    m2, m0, m3
    pmulhrsw    m3, m1
    paddsw      m0, m2
    paddsw      m1, m3
    punpckhwd   m2, m0, m1
    punpcklwd   m0, m1
    punpckhwd   m1, m0, m2              ;high: in3 ;low :in2
    punpcklwd   m0, m2                  ;high: in1 ;low: in0
    jmp         tx2q

.pass2:
    mova        m3, [o(pw_1697x8)]
    pmulhrsw    m2, m3, m0
    pmulhrsw    m3, m1
    paddsw      m0, m2
    paddsw      m1, m3
    jmp m(iadst_4x4_internal_8bpc).end

; 4-point inverse Walsh-Hadamard step on packed data.
; In: m0, m1. Out (per the comments below): m0 high = out0, m1 = out2|out1,
; m2 low = out3. Clobbers m3.
%macro IWHT4_1D_PACKED 0
    punpckhqdq  m3, m0, m1              ;low: in1 high: in3
    punpcklqdq  m0, m1                  ;low: in0 high: in2
    psubw       m2, m0, m3              ;low: in0 - in1 high: in2 - in3
    paddw       m0, m3                  ;low: in0 + in1 high: in2 + in3
    punpckhqdq  m2, m2                  ;t2 t2
    punpcklqdq  m0, m0                  ;t0 t0
    psubw       m1, m0, m2
    psraw       m1, 1                   ;t4 t4
    psubw       m1, m3                  ;low: t1/out2 high: t3/out1
    psubw       m0, m1                  ;high: out0
    paddw       m2, m1                  ;low: out3
%endmacro

; 4x4 WHT: loads and clears the coefficients, shifts them right by 2, runs
; two IWHT passes with a transpose in between, and adds the result to the
; destination without a rounding multiply (ITX4_END rnd = 0).
cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    pxor        m2, m2
    REPX {mova [coeffq+16*x], m2}, 0, 1
    REPX {psraw x, 2}, m0, m1

    IWHT4_1D_PACKED

    punpckhwd   m0, m1
    punpcklwd   m3, m1, m2
    punpckhdq   m1, m0, m3
    punpckldq   m0, m3

    IWHT4_1D_PACKED

    shufpd      m0, m2, 0x01
    ITX4_END    0, 3, 2, 1, 0


; 8-point inverse DCT on four packed registers.
; In: m0 = in0|in1, m1 = in2|in3, m2 = in4|in5, m3 = in6|in7.
; Out: m0 = out0|out1, m1 = out3|out2, m2 = out4|out5, m3 = out7|out6.
; Clobbers m4 and m6 (m6 holds pd_2048).
%macro IDCT8_1D_PACKED 0
    mova        m6, [o(pd_2048)]
    punpckhwd   m4, m0, m3              ;unpacked in1 in7
    punpcklwd   m0, m2                  ;unpacked in0 in4
    punpckhwd   m2, m1                  ;unpacked in5 in3
    punpcklwd   m1, m3                  ;unpacked in2 in6
    ITX_MUL2X_PACK 4, 3, 6,  799, 4017  ;low: t7a high: t4a
    ITX_MUL2X_PACK 2, 3, 6, 3406, 2276  ;low: t6a high: t5a
    ITX_MUL2X_PACK 1, 3, 6, 1567, 3784  ;low: t3 high: t2
    psubsw      m3, m4, m2              ;low: t6a high: t5a
    paddsw      m4, m2                  ;low: t7 high: t4
    pshufb      m3, [o(deint_shuf1)]
    ITX_MUL2X_PACK 0, 2, 6, 2896, 2896  ;low: t0 high: t1
    ITX_MUL2X_PACK 3, 2, 6, 2896, 2896  ;low: t6 high: t5
    psubsw      m2, m0, m1              ;low: tmp3 high: tmp2
    paddsw      m0, m1                  ;low: tmp0 high: tmp1
    punpcklqdq  m1, m4, m3              ;low: t7 high: t6
    punpckhqdq  m4, m3                  ;low: t4 high: t5
    psubsw      m3, m0, m1              ;low: out7 high: out6
    paddsw      m0, m1                  ;low: out0 high: out1
    paddsw      m1, m2, m4              ;low: out3 high: out2
    psubsw      m2, m4                  ;low: out4 high: out5
%endmacro

; Rotation of two full registers of words. When coef2 is below 8 both coef
; arguments are treated as register numbers holding the tables; otherwise
; the tables are read from memory. With the last argument set, dst2 is left
; in tmp1 instead of src2.
;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
    punpckhwd   m%4, m%1, m%2
    punpcklwd   m%1, m%2
%if %7 < 8
    pmaddwd     m%2, m%7, m%1
    pmaddwd     m%3, m%7, m%4
%else
    mova        m%2, [o(pw_%7_%6)]
%if %8
    pmaddwd     m%3, m%1, m%2
    pmaddwd     m%2, m%4
%else
    pmaddwd     m%3, m%4, m%2
    pmaddwd     m%2, m%1
%endif
%endif
    paddd       m%3, m%5
    paddd       m%2, m%5
    psrad       m%3, 12
    psrad       m%2, 12
%if %8
    packssdw    m%3, m%2
%else
    packssdw    m%2, m%3                ;dst2
%endif
%if %7 < 8
    pmaddwd     m%4, m%6
    pmaddwd     m%1, m%6
%elif %8
    mova        m%2, [o(pw_%6_m%7)]
    pmaddwd     m%4, m%2
    pmaddwd     m%1, m%2
%else
    mova        m%3, [o(pw_%6_m%7)]
    pmaddwd     m%4, m%3
    pmaddwd     m%1, m%3
%endif
    paddd       m%4, m%5
    paddd       m%1, m%5
    psrad       m%4, 12
    psrad       m%1, 12
    packssdw    m%1, m%4                ;dst1
%endmacro

; 4-point inverse DCT on four full registers; the results replace the four
; source registers in order out0..out3. Both tmp registers are clobbered.
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
    ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
    ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
    psubsw      m%3, m%1, m%2           ;out2
    paddsw      m%2, m%1                ;out1
    paddsw      m%1, m%5, m%4           ;out0
    psubsw      m%4, m%5                ;out3
%endmacro

; Adds m0-m3 to a 4x8 pixel block as two 4x4 writes; advances dstq by four
; rows. Clobbers m4-m6 and r2.
%macro WRITE_4X8 4 ;row[1-4]
    WRITE_4X4   0, 1, 4, 5, 6, %1, %2, %3, %4
    lea         dstq, [dstq+strideq*4]
    WRITE_4X4   2, 3, 4, 5, 6, %1, %2, %3, %4
%endmacro

; Word/dword interleave of m0-m3 that arranges the first-pass output as
; second-pass input pairs. Clobbers m4.
%macro INV_4X8 0
    punpckhwd   m4, m2, m3
    punpcklwd   m2, m3
    punpckhwd   m3, m0, m1
    punpcklwd   m0, m1
    punpckhdq   m1, m0, m2              ;low: in2 high: in3
    punpckldq   m0, m2                  ;low: in0 high: in1
    punpckldq   m2, m3, m4              ;low: in4 high: in5
    punpckhdq   m3, m4                  ;low: in6 high: in7
%endmacro

; 4x8 entry point. The dct_dct DC-only path scales the broadcast first
; coefficient three times by pw_2896x8 and once by pw_2048, replicates it
; into m0-m3 and jumps to the shared store tail. eobd is zero on this path.
%macro INV_TXFM_4X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x8, 8
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000
    punpcklqdq  m0, m0
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    mov         [coeffq], eobd
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    pmulhrsw    m0, [o(pw_2048)]
    mova        m1, m0
    mova        m2, m0
    mova        m3, m0
    TAIL_CALL m(iadst_4x8_internal_8bpc).end3
%endif
%endmacro

INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
INV_TXFM_4X8_FN dct, identity

; 4x8 DCT. Coefficients are pre-scaled with pmulhrsw by pw_2896x8 on load.
; Pass 1 uses the 8x4 DCT kernel and the shared transpose tail of the ADST
; function; pass 2 uses the packed 8-point kernel below.
cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3, [coeffq+16*3]

.pass1:
    call m(idct_8x4_internal_8bpc).main
    jmp m(iadst_4x8_internal_8bpc).pass1_end

.pass2:
    call .main
    REPX {shufps x, x, q1032}, m1, m3
    mova        m4, [o(pw_2048)]
    jmp m(iadst_4x8_internal_8bpc).end2

ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret


INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

; 4x8 ADST. Also hosts the tails shared by the other 4x8 transforms:
;   .pass1_end  transpose, then jump to the second pass
;   .end        builds the multiplier m4 = low: m4, high: m5
;   .end2       scales m0-m3 by m4 and clears the coefficient buffer
;   .end3       adds m0-m3 to the destination and returns
cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3, [coeffq+16*3]

.pass1:
    call m(iadst_8x4_internal_8bpc).main

.pass1_end:
    INV_4X8
    jmp         tx2q

.pass2:
    REPX {shufps x, x, q1032}, m0, m1
    call .main
    ; m4 = +2048, m5 = -2048: .end combines them into one multiplier
    mova        m4, [o(pw_2048)]
    pxor        m5, m5
    psubw       m5, m4

.end:
    punpcklqdq  m4, m5

.end2:
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    pxor        m5, m5
    REPX {mova [coeffq+16*x], m5}, 0, 1, 2, 3

.end3:
    WRITE_4X8   0, 1, 2, 3
    RET

; 8-point inverse ADST kernel on packed data (local routine of the 4x8 ADST
; function; also called from the 8x4 and flipadst variants).
; In:  m0-m3 (packed input pairs, see the unpack comments below).
; Out: m0 = out0|-out1, m1 = out2|-out3, m2 = out4|-out5, m3 = out6|-out7
;      (odd outputs are negated, as noted in the per-line comments).
; Clobbers m4-m7; m6 holds pd_2048 throughout.
ALIGN function_align
cglobal_label .main
    mova                 m6, [o(pd_2048)]
    punpckhwd            m4, m3, m0                ;unpacked in7 in0
    punpckhwd            m5, m2, m1                ;unpacked in5 in2
    punpcklwd            m1, m2                    ;unpacked in3 in4
    punpcklwd            m0, m3                    ;unpacked in1 in6
    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a

    psubsw               m3, m4, m1                ;low:  t4    high:  t5
    paddsw               m4, m1                    ;low:  t0    high:  t1
    psubsw               m2, m5, m0                ;low:  t6    high:  t7
    paddsw               m5, m0                    ;low:  t2    high:  t3

    shufps               m1, m3, m2, q1032
    punpckhwd            m2, m1
    punpcklwd            m3, m1
    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a

    psubsw               m1, m4, m5                ;low:  t2    high:  t3
    paddsw               m4, m5                    ;low:  out0  high: -out7
    psubsw               m5, m3, m2                ;low:  t7    high:  t6
    paddsw               m3, m2                    ;low:  out6  high: -out1
    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
    shufps               m3, m4, q3210             ;low:  out6  high: -out7

    ; final +-2896 butterfly on t2/t3/t6/t7, done with explicit pmaddwd so
    ; that all four dword results share one rounding/shift sequence
    mova                 m2, [o(pw_2896_m2896)]
    mova                 m7, [o(pw_2896_2896)]
    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
    shufps               m1, m5, q3210             ;low:  t2    high:  t6
    punpcklwd            m5, m1, m4
    punpckhwd            m1, m4
    pmaddwd              m4, m2, m1                ;-out5
    pmaddwd              m2, m5                    ; out4
    pmaddwd              m1, m7                    ; out2
    pmaddwd              m5, m7                    ;-out3
    REPX      {paddd x, m6}, m4, m2, m1, m5
    REPX      {psrad x, 12}, m4, m2, m1, m5
    packssdw             m1, m5                    ;low:  out2  high: -out3
    packssdw             m2, m4                    ;low:  out4  high: -out5
    ret

INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

; 4x8 flipped ADST: same kernels as the ADST function, with the output order
; reversed by the unpack sequence (pass 1) and by the register/half swap
; before the shared tail (pass 2).
cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; load the four coefficient rows, each scaled with pmulhrsw by pw_2896x8
    mova                 m3, [o(pw_2896x8)]
    pmulhrsw             m0, m3, [coeffq+16*0]
    pmulhrsw             m1, m3, [coeffq+16*1]
    pmulhrsw             m2, m3, [coeffq+16*2]
    pmulhrsw             m3, [coeffq+16*3]

.pass1:
    call m(iadst_8x4_internal_8bpc).main

    ; transpose with source registers taken in reverse order (m3..m0)
    punpcklwd            m4, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m5, m1, m0
    punpckhwd            m1, m0
    punpckldq            m2, m3, m1                  ;low: in4 high: in5
    punpckhdq            m3, m1                      ;low: in6 high: in7
    punpckldq            m0, m4, m5                  ;low: in0 high: in1
    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
    jmp                tx2q

.pass2:
    shufps               m0, m0, q1032
    shufps               m1, m1, q1032
    call m(iadst_4x8_internal_8bpc).main

    ; reverse register order (m0<->m3, m1<->m2) and swap the 64-bit halves
    mova                 m4, m0
    mova                 m5, m1
    pshufd               m0, m3, q1032
    pshufd               m1, m2, q1032
    pshufd               m2, m5, q1032
    pshufd               m3, m4, q1032
    ; m5 = +2048, m4 = -2048; the shared .end tail packs them as
    ; low: m4, high: m5 (opposite signs to the non-flipped pass 2)
    mova                 m5, [o(pw_2048)]
    pxor                 m4, m4
    psubw                m4, m5
    jmp m(iadst_4x8_internal_8bpc).end

INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

; 4x8 identity. Pass 1 computes x + pmulhrsw(x, 1697*8) on the pre-scaled
; rows and reuses the shared transpose; pass 2 only selects pw_4096 as the
; final multiplier for the shared tail.
cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova                 m3, [o(pw_2896x8)]
    pmulhrsw             m0, m3, [coeffq+16*0]
    pmulhrsw             m1, m3, [coeffq+16*1]
    pmulhrsw             m2, m3, [coeffq+16*2]
    pmulhrsw             m3, [coeffq+16*3]

.pass1:
    mova                 m7, [o(pw_1697x8)]
    pmulhrsw             m4, m7, m0
    pmulhrsw             m5, m7, m1
    pmulhrsw             m6, m7, m2
    pmulhrsw             m7, m3
    paddsw               m0, m4
    paddsw               m1, m5
    paddsw               m2, m6
    paddsw               m3, m7
    jmp m(iadst_4x8_internal_8bpc).pass1_end

.pass2:
    mova                 m4, [o(pw_4096)]
    jmp m(iadst_4x8_internal_8bpc).end2


; WRITE_8X2 coef1, coef2, tmp1, tmp2, tmp3
; Adds two rows of eight residual words to the 8-bit pixel rows at dstq and
; dstq+strideq and stores them back with unsigned saturation. Each coef
; argument is either a register number or, when it is not numeric, an operand
; used as-is (for example a stack slot). dstq is not advanced.
%macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
    movq                 m%3, [dstq        ]
    movq                 m%4, [dstq+strideq]
    pxor                 m%5, m%5
    punpcklbw            m%3, m%5                 ;extend byte to word
    punpcklbw            m%4, m%5                 ;extend byte to word
%ifnum %1
    paddw                m%3, m%1
%else
    paddw                m%3, %1
%endif
%ifnum %2
    paddw                m%4, m%2
%else
    paddw                m%4, %2
%endif
    packuswb             m%3, m%4
    movq      [dstq        ], m%3
    punpckhqdq           m%3, m%3
    movq      [dstq+strideq], m%3
%endmacro

; WRITE_8X4 coef1..coef4, tmp1..tmp3
; Four rows as two WRITE_8X2 calls. dstq is advanced by two rows in between,
; so on exit it points at the third of the four rows written.
%macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
    WRITE_8X2             %1, %2, %5, %6, %7
    lea                 dstq, [dstq+strideq*2]
    WRITE_8X2             %3, %4, %5, %6, %7
%endmacro
; 8x4 entry point. The dct_dct DC-only path scales the broadcast first
; coefficient three times by pw_2896x8 and once by pw_2048, replicates it
; into m0-m3 and jumps to the shared tail that clears the coefficients and
; stores the block.
%macro INV_TXFM_8X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x4, 8
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000
    punpcklqdq  m0, m0
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    mova        m2, [o(pw_2048)]
    pmulhrsw    m0, m1
    pmulhrsw    m0, m2
    mova        m1, m0
    mova        m2, m0
    mova        m3, m0
    TAIL_CALL m(iadst_8x4_internal_8bpc).end2
%endif
%endmacro

INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
INV_TXFM_8X4_FN dct, identity

; 8x4 DCT. Pass 1 runs the packed 8-point kernel of the 4x8 DCT function and
; transposes; pass 2 runs the full-register 4-point kernel below and joins
; the shared 8x4 tail.
cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3, [coeffq+16*3]

    call m(idct_4x8_internal_8bpc).main

    mova        m4, [o(deint_shuf1)]
    mova        m5, [o(deint_shuf2)]
    pshufb      m0, m4
    pshufb      m1, m5
    pshufb      m2, m4
    pshufb      m3, m5
    punpckhdq   m4, m0, m1
    punpckldq   m0, m1
    punpckhdq   m5, m2, m3
    punpckldq   m2, m3
    punpckhqdq  m1, m0, m2              ;in1
    punpcklqdq  m0, m2                  ;in0
    punpckhqdq  m3, m4, m5              ;in3
    punpcklqdq  m2, m4, m5              ;in2
    jmp         tx2q

.pass2:
    call .main
    jmp m(iadst_8x4_internal_8bpc).end

; 4-point inverse DCT on m0-m3 (one row per register). Clobbers m4-m6.
ALIGN function_align
cglobal_label .main
    mova        m6, [o(pd_2048)]
    IDCT4_1D    0, 1, 2, 3, 4, 5, 6
    ret

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; 8x4 ADST. Also hosts the tails shared by the other 8x4 transforms:
;   .end   scales m0-m3 with pmulhrsw by pw_2048
;   .end2  clears the coefficient buffer
;   .end3  adds m0-m3 to the destination and returns
cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3, [coeffq+16*3]

    REPX {shufps x, x, q1032}, m0, m1
    call m(iadst_4x8_internal_8bpc).main

    punpckhwd   m4, m0, m1
    punpcklwd   m0, m1
    punpckhwd   m1, m2, m3
    punpcklwd   m2, m3
    ; negate the high-half interleaves (0 - x with saturation)
    pxor        m5, m5
    psubsw      m3, m5, m1
    psubsw      m5, m4
    punpckhdq   m4, m5, m3
    punpckldq   m5, m3
    punpckhdq   m3, m0, m2
    punpckldq   m0, m2
    punpckhwd   m1, m0, m5              ;in1
    punpcklwd   m0, m5                  ;in0
    punpcklwd   m2, m3, m4              ;in2
    punpckhwd   m3, m4                  ;in3
    jmp         tx2q

.pass2:
    call .main

.end:
    mova        m4, [o(pw_2048)]
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3

.end2:
    pxor        m6, m6
    REPX {mova [coeffq+16*x], m6}, 0, 1, 2, 3
.end3:
    WRITE_8X4   0, 1, 2, 3, 4, 5, 6
    RET

; 4-point inverse ADST on four full registers (m0-m3 = in0-in3, eight
; columns each); the low and high four columns are processed separately.
; Out: m0-m3 = out0-out3. Clobbers m4-m7.
ALIGN function_align
cglobal_label .main
    punpckhwd   m6, m0, m2                  ;unpacked in0 in2 (high half)
    punpcklwd   m0, m2                      ;unpacked in0 in2 (low half)
    punpckhwd   m7, m1, m3                  ;unpacked in1 in3 (high half)
    punpcklwd   m1, m3                      ;unpacked in1 in3 (low half)

    mova        m2, [o(pw_3344_m3344)]
    mova        m4, [o(pw_0_3344)]
    pmaddwd     m3, m2, m6                  ;3344 * in0 - 3344 * in2
    pmaddwd     m5, m4, m7                  ;3344 * in3
    pmaddwd     m2, m0
    pmaddwd     m4, m1
    paddd       m3, m5
    paddd       m2, m4
    mova        m4, [o(pd_2048)]
    ;t2 + 2048 (m3 = high half, m2 = low half)
    REPX {paddd x, m4}, m3, m2
    REPX {psrad x, 12}, m3, m2
    packssdw    m2, m3                      ;out2

    pmaddwd     m4, m0, [o(pw_1321_3803)]   ;1321 * in0 + 3803 * in2
    pmaddwd     m0, [o(pw_2482_m1321)]      ;2482 * in0 - 1321 * in2
    pmaddwd     m3, m1, [o(pw_3344_2482)]   ;3344 * in1 + 2482 * in3
    pmaddwd     m5, m1, [o(pw_3344_m3803)]  ;3344 * in1 - 3803 * in3
    paddd       m3, m4                      ;t0 + t3

    pmaddwd     m1, [o(pw_m6688_m3803)]     ;-2 * 3344 * in1 - 3803 * in3
    mova        m4, [o(pd_2048)]
    paddd       m0, m4
    paddd       m4, m3                      ;t0 + t3 + 2048
    paddd       m5, m0                      ;t1 + t3 + 2048
    paddd       m3, m0
    paddd       m3, m1                      ;t0 + t1 - t3 + 2048

    ;m4 = out0, m5 = out1, m3 = out3 (low half)
    REPX {psrad x, 12}, m4, m5, m3
    packssdw    m0, m4, m5                  ;low: out0 high: out1

    pmaddwd     m4, m6, [o(pw_1321_3803)]   ;1321 * in0 + 3803 * in2
    pmaddwd     m6, [o(pw_2482_m1321)]      ;2482 * in0 - 1321 * in2
    pmaddwd     m1, m7, [o(pw_3344_2482)]   ;3344 * in1 + 2482 * in3
    pmaddwd     m5, m7, [o(pw_3344_m3803)]  ;3344 * in1 - 3803 * in3
    paddd       m1, m4                      ;t0 + t3
    pmaddwd     m7, [o(pw_m6688_m3803)]     ;-2 * 3344 * in1 - 3803 * in3

    mova        m4, [o(pd_2048)]
    paddd       m6, m4
    paddd       m4, m1                      ;t0 + t3 + 2048
    paddd       m5, m6                      ;t1 + t3 + 2048
    paddd       m1, m6
    paddd       m1, m7                      ;t0 + t1 - t3 + 2048

    ;m4 = out0, m5 = out1, m1 = out3 (high half)
    REPX {psrad x, 12}, m4, m5, m1
    packssdw    m3, m1                      ;out3
    packssdw    m4, m5                      ;low: out0 high: out1

    punpckhqdq  m1, m0, m4                  ;out1
    punpcklqdq  m0, m4                      ;out0
    ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; 8x4 flipped ADST: same kernels as the ADST function with the output order
; reversed (unpack order in pass 1, register swap in pass 2).
cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3, [coeffq+16*3]

    REPX {shufps x, x, q1032}, m0, m1
    call m(iadst_4x8_internal_8bpc).main

    punpckhwd   m5, m3, m2
    punpcklwd   m3, m2
    punpckhwd   m2, m1, m0
    punpcklwd   m1, m0

    pxor        m0, m0
    psubsw      m4, m0, m2
    psubsw      m0, m5
    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckhdq   m4, m3, m1
    punpckldq   m3, m1
    punpckhwd   m1, m0, m3              ;in1
    punpcklwd   m0, m3                  ;in0
    punpckhwd   m3, m2, m4              ;in3
    punpcklwd   m2, m4                  ;in2
    jmp         tx2q

.pass2:
    call m(iadst_8x4_internal_8bpc).main
    ; reverse the row order: m0 <-> m3, m1 <-> m2
    mova        m4, m0
    mova        m5, m1
    mova        m0, m3
    mova        m1, m2
    mova        m2, m5
    mova        m3, m4
    jmp m(iadst_8x4_internal_8bpc).end

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 identity. Pass 1 doubles the pre-scaled rows with saturation and
; transposes; pass 2 computes x + pmulhrsw(x, 1697*8) and joins the shared
; 8x4 tail.
cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3, [coeffq+16*3]
    REPX {paddsw x, x}, m0, m1, m2, m3

    punpckhwd   m4, m0, m1
    punpcklwd   m0, m1
    punpckhwd   m1, m2, m3
    punpcklwd   m2, m3
    punpckhdq   m5, m4, m1
    punpckldq   m4, m1
    punpckhdq   m3, m0, m2
    punpckldq   m0, m2
    punpckhwd   m1, m0, m4              ;in1
    punpcklwd   m0, m4                  ;in0
    punpcklwd   m2, m3, m5              ;in2
    punpckhwd   m3, m5                  ;in3
    jmp         tx2q

.pass2:
    mova        m7, [o(pw_1697x8)]
    pmulhrsw    m4, m7, m0
    pmulhrsw    m5, m7, m1
    pmulhrsw    m6, m7, m2
    pmulhrsw    m7, m3
    paddsw      m0, m4
    paddsw      m1, m5
    paddsw      m2, m6
    paddsw      m3, m7
    jmp m(iadst_8x4_internal_8bpc).end

; 8x8 entry point; reserves 16*4 bytes of stack through INV_TXFM_FN.
; The dct_dct DC-only path broadcasts the first coefficient, scales it
; (pw_2896x8, pw_16384, pw_2896x8, then pw_16384 >> 3) and writes the same
; row value to all eight rows in a two-iteration loop. eobd is zero on this
; path, so the store below clears the first 32 bits of the coefficients.
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x8, 8, 16*4
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000
    punpcklwd   m0, m0
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    mova        m2, [o(pw_16384)]
    mov         [coeffq], eobd
    pmulhrsw    m0, m2
    psrlw       m2, 3
    pmulhrsw    m0, m1
    pmulhrsw    m0, m2
.end:
    mov         r3d, 2
    lea         tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
.loop:
    WRITE_8X4   0, 0, 0, 0, 1, 2, 3
    lea         dstq, [dstq+strideq*2]
    dec         r3d
    jg .loop
    jmp         tx2q
.end3:
    RET
%endif
%endmacro

; LOAD_8ROWS src, stride [, is_rect2]
; Loads eight rows into m0-m7. With is_rect2 set, every row is scaled with
; pmulhrsw by pw_2896x8 while loading.
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%if %3
    mova        m7, [o(pw_2896x8)]
    pmulhrsw    m0, m7, [%1+%2*0]
    pmulhrsw    m1, m7, [%1+%2*1]
    pmulhrsw    m2, m7, [%1+%2*2]
    pmulhrsw    m3, m7, [%1+%2*3]
    pmulhrsw    m4, m7, [%1+%2*4]
    pmulhrsw    m5, m7, [%1+%2*5]
    pmulhrsw    m6, m7, [%1+%2*6]
    pmulhrsw    m7, [%1+%2*7]
%else
    mova        m0, [%1+%2*0]
    mova        m1, [%1+%2*1]
    mova        m2, [%1+%2*2]
    mova        m3, [%1+%2*3]
    mova        m4, [%1+%2*4]
    mova        m5, [%1+%2*5]
    mova        m6, [%1+%2*6]
    mova        m7, [%1+%2*7]
%endif
%endmacro

; Odd half of the 8-point inverse DCT. On exit, following the comments
; below: src1 = t4, src4 = t7, and the last rotation leaves t5/t6 in
; src2 and src3.
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
    ITX_MULSUB_2W %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
    ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
    psubsw      m%2, m%4, m%5           ;t6a
    paddsw      m%4, m%5                ;t7
    psubsw      m%5, m%1, m%3           ;t5a
    paddsw      m%1, m%3                ;t4
    ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
INV_TXFM_8X8_FN dct, identity

; 8x8 DCT. Row 7 does not fit in registers alongside the temporaries and
; lives in stack slot 0 between the kernel and the scaling/transposition
; code. The labelled tails (.pass1_end*, .end*) are jumped to by the 8x8
; ADST function as well.
cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS  coeffq, 16

.pass1:
    call .main

.pass1_end:
    mova        m7, [o(pw_16384)]

.pass1_end1:
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova        [rsp+gprsize+16*1], m6

.pass1_end2:
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw    m7, [rsp+gprsize+16*0]

; 8x8 word transpose; the comments list row/column digits of each word.
cglobal_label .pass1_end3
    punpcklwd   m6, m1, m5              ;10 50 11 51 12 52 13 53
    punpckhwd   m1, m5                  ;14 54 15 55 16 56 17 57
    punpckhwd   m5, m0, m4              ;04 44 05 45 06 46 07 47
    punpcklwd   m0, m4                  ;00 40 01 41 02 42 03 43
    punpckhwd   m4, m3, m7              ;34 74 35 75 36 76 37 77
    punpcklwd   m3, m7                  ;30 70 31 71 32 72 33 73
    punpckhwd   m7, m1, m4              ;16 36 56 76 17 37 57 77
    punpcklwd   m1, m4                  ;14 34 54 74 15 35 55 75
    punpckhwd   m4, m6, m3              ;12 32 52 72 13 33 53 73
    punpcklwd   m6, m3                  ;10 30 50 70 11 31 51 71
    mova        [rsp+gprsize+16*2], m6
    mova        m6, [rsp+gprsize+16*1]
    punpckhwd   m3, m2, m6              ;24 64 25 65 26 66 27 67
    punpcklwd   m2, m6                  ;20 60 21 61 22 62 23 63
    punpckhwd   m6, m5, m3              ;06 26 46 66 07 27 47 67
    punpcklwd   m5, m3                  ;04 24 44 64 05 25 45 65
    punpckhwd   m3, m0, m2              ;02 22 42 62 03 23 43 63
    punpcklwd   m0, m2                  ;00 20 40 60 01 21 41 61

    punpckhwd   m2, m6, m7              ;07 17 27 37 47 57 67 77
    punpcklwd   m6, m7                  ;06 16 26 36 46 56 66 76
    mova        [rsp+gprsize+16*0], m2
    punpcklwd   m2, m3, m4              ;02 12 22 32 42 52 62 72
    punpckhwd   m3, m4                  ;03 13 23 33 43 53 63 73
    punpcklwd   m4, m5, m1              ;04 14 24 34 44 54 64 74
    punpckhwd   m5, m1                  ;05 15 25 35 45 55 65 75
    mova        m7, [rsp+gprsize+16*2]
    punpckhwd   m1, m0, m7              ;01 11 21 31 41 51 61 71
    punpcklwd   m0, m7                  ;00 10 20 30 40 50 60 70
    mova        m7, [rsp+gprsize+16*0]
    jmp         tx2q

.pass2:
    ; after the store, .end3 continues at .end4 (coefficient clear)
    lea         tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.pass2_main:
    call .main

.end:
    mova        m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova        [rsp+gprsize+16*1], m6

.end2:
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw    m7, [rsp+gprsize+16*0]
    mova        [rsp+gprsize+16*2], m5
    mova        [rsp+gprsize+16*0], m7

.end3:
    ; rows 0-3 from registers, rows 4-7 from m4 and the three stack slots
    WRITE_8X4   0, 1, 2, 3, 5, 6, 7
    lea         dstq, [dstq+strideq*2]
    WRITE_8X4   4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
    jmp         tx2q

.end4:
    pxor        m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    ret

; 8-point inverse DCT on m0-m7 (one row per register).
; Out: m0-m6 = out0-out6, out7 in stack slot 0.
; The stack offsets use gprsize*2 here versus gprsize in the callers,
; presumably to account for the return address pushed by the call - confirm.
ALIGN function_align
cglobal_label .main
    mova        [rsp+gprsize*2+16*0], m7
    mova        [rsp+gprsize*2+16*1], m3
    mova        [rsp+gprsize*2+16*2], m1
    mova        m7, [o(pd_2048)]
    IDCT4_1D    0, 2, 4, 6, 1, 3, 7
    mova        m3, [rsp+gprsize*2+16*2]
    mova        [rsp+gprsize*2+16*2], m2
    mova        m2, [rsp+gprsize*2+16*1]
    mova        [rsp+gprsize*2+16*1], m4
    mova        m4, [rsp+gprsize*2+16*0]
    mova        [rsp+gprsize*2+16*0], m6
    IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
    mova        m6, [rsp+gprsize*2+16*0]
    psubsw      m7, m0, m4              ;out7
    paddsw      m0, m4                  ;out0
    mova        [rsp+gprsize*2+16*0], m7
    mova        m1, [rsp+gprsize*2+16*2]
    psubsw      m4, m6, m3              ;out4
    paddsw      m3, m6                  ;out3
    mova        m7, [rsp+gprsize*2+16*1]
    psubsw      m6, m1, m5              ;out6
    paddsw      m1, m5                  ;out1
    psubsw      m5, m7, m2              ;out5
    paddsw      m2, m7                  ;out2
    ret


INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; 8x8 ADST. The tails scale the even registers by the positive constant,
; then negate the constant (m7 = 0 - m7) and jump to the matching DCT tail,
; which applies it to m1, m3, m5 and stack slot 0.
cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS  coeffq, 16

.pass1:
    call .main
    call .main_pass1_end

.pass1_end:
    mova        m7, [o(pw_16384)]

.pass1_end1:
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova        [rsp+gprsize+16*1], m6
    pxor        m6, m6
    psubw       m6, m7
    mova        m7, m6
    jmp m(idct_8x8_internal_8bpc).pass1_end2

ALIGN function_align
.pass2:
    lea         tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.pass2_main:
    call .main
    call .main_pass2_end

.end:
    mova        m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova        [rsp+gprsize+16*1], m6
    pxor        m6, m6
    psubw       m6, m7
    mova        m7, m6
    jmp m(idct_8x8_internal_8bpc).end2

ALIGN function_align
cglobal_label .main
    mova        [rsp+gprsize*2+16*0], m7
    mova        [rsp+gprsize*2+16*1], m3
    mova        [rsp+gprsize*2+16*2], m4
    mova        m7, [o(pd_2048)]
    ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
    ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
    paddsw      m3, m2, m6              ;t2
    psubsw      m2, m6                  ;t6
    paddsw      m4, m5, m1              ;t3
    psubsw      m5, m1                  ;t7
    ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a

    mova        m6, [rsp+gprsize*2+16*2]
    mova        [rsp+gprsize*2+16*2], m5
    mova        m1, [rsp+gprsize*2+16*1]
    mova        [rsp+gprsize*2+16*1], m2
    mova        m5, [rsp+gprsize*2+16*0]
    mova        [rsp+gprsize*2+16*0], m3
    ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076  ;t1a, t0a
    ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
    psubsw      m2, m0, m6              ;t4
    paddsw      m0, m6                  ;t0
    paddsw      m3, m5, m1              ;t1
    psubsw      m5, m1                  ;t5
    ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a

    mova        m7, [rsp+gprsize*2+16*0]
    paddsw      m1, m3, m4              ;-out7
    psubsw      m3, m4                  ;t3
    mova        [rsp+gprsize*2+16*0], m1
    psubsw      m4, m0, m7              ;t2
    paddsw      m0, m7                  ;out0
    mova        m6, [rsp+gprsize*2+16*2]
    mova        m7, [rsp+gprsize*2+16*1]
    paddsw      m1, m5, m6              ;-out1
    psubsw      m5, m6                  ;t6
    paddsw      m6, m2, m7              ;out6
    psubsw      m2, m7                  ;t7
    ret
ALIGN function_align
.main_pass1_end:
    mova        [rsp+gprsize*2+16*1], m1
    mova        [rsp+gprsize*2+16*2], m6
    punpckhwd   m1, m4, m3
    punpcklwd   m4, m3
    punpckhwd   m7, m5, m2
    punpcklwd   m5, m2
1310 mova m2, [o(pw_2896_2896)] 1311 mova m6, [o(pd_2048)] 1312 pmaddwd m3, m2, m7 1313 pmaddwd m2, m5 1314 paddd m3, m6 1315 paddd m2, m6 1316 psrad m3, 12 1317 psrad m2, 12 1318 packssdw m2, m3 ;out2 1319 mova m3, [o(pw_2896_m2896)] 1320 pmaddwd m7, m3 1321 pmaddwd m5, m3 1322 paddd m7, m6 1323 paddd m5, m6 1324 psrad m7, 12 1325 psrad m5, 12 1326 packssdw m5, m7 ;-out5 1327 mova m3, [o(pw_2896_2896)] 1328 pmaddwd m7, m3, m1 1329 pmaddwd m3, m4 1330 paddd m7, m6 1331 paddd m3, m6 1332 psrad m7, 12 1333 psrad m3, 12 1334 packssdw m3, m7 ;-out3 1335 mova m7, [o(pw_2896_m2896)] 1336 pmaddwd m1, m7 1337 pmaddwd m4, m7 1338 paddd m1, m6 1339 paddd m4, m6 1340 psrad m1, 12 1341 psrad m4, 12 1342 packssdw m4, m1 ;out4 1343 mova m1, [rsp+gprsize*2+16*1] 1344 mova m6, [rsp+gprsize*2+16*2] 1345 ret 1346ALIGN function_align 1347cglobal_label .main_pass2_end 1348 paddsw m7, m4, m3 ;t2 + t3 1349 psubsw m4, m3 ;t2 - t3 1350 paddsw m3, m5, m2 ;t6 + t7 1351 psubsw m5, m2 ;t6 - t7 1352 mova m2, [o(pw_2896x8)] 1353 pmulhrsw m4, m2 ;out4 1354 pmulhrsw m5, m2 ;-out5 1355 pmulhrsw m7, m2 ;-out3 1356 pmulhrsw m2, m3 ;out2 1357 mova m3, m7 1358 ret 1359 1360INV_TXFM_8X8_FN flipadst, dct 1361INV_TXFM_8X8_FN flipadst, adst 1362INV_TXFM_8X8_FN flipadst, flipadst 1363INV_TXFM_8X8_FN flipadst, identity 1364 1365cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1366 LOAD_8ROWS coeffq, 16 1367 1368.pass1: 1369 call m(iadst_8x8_internal_8bpc).main 1370 call m(iadst_8x8_internal_8bpc).main_pass1_end 1371 1372.pass1_end: 1373 mova m7, [o(pw_m16384)] 1374 1375.pass1_end1: 1376 pmulhrsw m1, m7 1377 mova [rsp+gprsize+16*1], m1 1378 mova m1, m6 1379 mova m6, m2 1380 pmulhrsw m2, m5, m7 1381 mova m5, m6 1382 mova m6, m4 1383 pmulhrsw m4, m3, m7 1384 mova m3, m6 1385 mova m6, m0 1386 mova m0, m7 1387 pxor m7, m7 1388 psubw m7, m0 1389 pmulhrsw m0, [rsp+gprsize+16*0] 1390 REPX {pmulhrsw x, m7}, m1, m3, m5 1391 pmulhrsw m7, m6 1392 jmp m(idct_8x8_internal_8bpc).pass1_end3 1393 
1394ALIGN function_align 1395.pass2: 1396 lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] 1397 1398.pass2_main: 1399 call m(iadst_8x8_internal_8bpc).main 1400 call m(iadst_8x8_internal_8bpc).main_pass2_end 1401 1402.end: 1403 mova m7, [o(pw_2048)] 1404 REPX {pmulhrsw x, m7}, m0, m2, m4, m6 1405 mova [rsp+gprsize+16*2], m2 1406 mova m2, m0 1407 pxor m0, m0 1408 psubw m0, m7 1409 mova m7, m2 1410 pmulhrsw m1, m0 1411 pmulhrsw m2, m5, m0 1412 mova [rsp+gprsize+16*1], m1 1413 mova m5, m4 1414 mova m1, m6 1415 pmulhrsw m4, m3, m0 1416 pmulhrsw m0, [rsp+gprsize+16*0] 1417 mova m3, m5 1418 mova [rsp+gprsize+16*0], m7 1419 jmp m(idct_8x8_internal_8bpc).end3 1420 1421INV_TXFM_8X8_FN identity, dct 1422INV_TXFM_8X8_FN identity, adst 1423INV_TXFM_8X8_FN identity, flipadst 1424INV_TXFM_8X8_FN identity, identity 1425 1426cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 1427 LOAD_8ROWS coeffq, 16 1428 mova [rsp+gprsize+16*1], m6 1429 jmp m(idct_8x8_internal_8bpc).pass1_end3 1430 1431ALIGN function_align 1432.pass2: 1433 lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] 1434 1435.end: 1436 pmulhrsw m7, [o(pw_4096)] 1437 mova [rsp+gprsize+16*0], m7 1438 mova m7, [o(pw_4096)] 1439 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 1440 mova [rsp+gprsize+16*2], m5 1441 mova [rsp+gprsize+16*1], m6 1442 jmp m(idct_8x8_internal_8bpc).end3 1443 1444 1445%macro INV_TXFM_4X16_FN 2 ; type1, type2 1446 INV_TXFM_FN %1, %2, 4x16, 8 1447%ifidn %1_%2, dct_dct 1448 pshuflw m0, [coeffq], q0000 1449 punpcklwd m0, m0 1450 mova m1, [o(pw_2896x8)] 1451 pmulhrsw m0, m1 1452 mov [coeffq], eobd 1453 pmulhrsw m0, [o(pw_16384)] 1454 pmulhrsw m0, m1 1455 pmulhrsw m0, [o(pw_2048)] 1456.end: 1457 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1458 lea dstq, [dstq+strideq*4] 1459 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1460 lea dstq, [dstq+strideq*4] 1461 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1462 lea dstq, [dstq+strideq*4] 1463 WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 1464 RET 1465%endif 1466%endmacro 1467 
; ---------------------------------------------------------------------------
; 4x16 inverse transforms
; ---------------------------------------------------------------------------
INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
INV_TXFM_4X16_FN dct, identity

; idct_4x16_internal_8bpc
; Pass 1 (.pass1) is shared by the dct/adst/flipadst 4x16 types: the 4x8
; pass-1 routine whose address is in r3 is run twice, first on the
; coefficient vectors at 16-byte offsets 1/3/5/7 and then on those at
; offsets 0/2/4/6. The caller's tx2q is pushed and tx2q is pointed at the
; local continuation labels instead; the 4x8 routine is expected to finish
; with `jmp tx2q` (it is defined outside this section - confirm there).
cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea         r3, [o(m(idct_4x8_internal_8bpc).pass1)]

.pass1:
    mova        m0, [coeffq+16*1]
    mova        m1, [coeffq+16*3]
    mova        m2, [coeffq+16*5]
    mova        m3, [coeffq+16*7]
    push        tx2q                        ; keep the real pass-2 target
    lea         tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
    jmp         r3

.pass1_2:
    ; store the results of the first batch back in place, load the second
    mova        [coeffq+16*1], m0
    mova        [coeffq+16*3], m1
    mova        [coeffq+16*5], m2
    mova        [coeffq+16*7], m3
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*2]
    mova        m2, [coeffq+16*4]
    mova        m3, [coeffq+16*6]
    lea         tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
    jmp         r3

.pass1_end:
    pop         tx2q                        ; restore the real pass-2 target

    ; m0-m3 = second batch, m4-m6 = first three vectors of the first batch.
    ; pmulhrsw by 16384 is a rounded right shift by one bit.
    mova        m4, [coeffq+16*1]
    mova        m5, [coeffq+16*3]
    mova        m6, [coeffq+16*5]
    mova        m7, [o(pw_16384)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6

    ; the eighth vector is scaled too but stays in memory
    pmulhrsw    m7, [coeffq+16*7]
    mova        [coeffq+16*7], m7
    jmp         tx2q

.pass2:
    call        m(idct_16x4_internal_8bpc).main

.end:
    ; final rounding: pmulhrsw by 2048 is a rounded right shift by four bits
    mova        m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw    m7, [coeffq+16*7]
    mova        [coeffq+16*4], m4

.end1:
    ; m4-m6 are parked in the coefficient buffer while the first four
    ; vectors are written; coeffq is copied to r3 and only r3 is used after
    ; WRITE_4X8 (presumably the macro clobbers coeffq - confirm).
    mova        [coeffq+16*5], m5
    mova        [coeffq+16*6], m6
    mov         r3, coeffq
    WRITE_4X8 0, 1, 3, 2

    mova        m0, [r3+16*4]
    mova        m1, [r3+16*5]
    mova        m2, [r3+16*6]
    mova        m3, m7
    lea         dstq, [dstq+strideq*4]
    WRITE_4X8 0, 1, 3, 2

.end2:
    ; clear the eight 16-byte coefficient vectors
    pxor        m7, m7
    REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

; iadst_4x16_internal_8bpc
; Pass 1 reuses idct_4x16 .pass1 with r3 pointing at the 4x8 ADST pass 1.
; Pass 2 runs the shared 16x4 ADST core, then regroups the packed outputs
; so that the positive ones and the negated ones sit in separate registers
; and can be scaled by +m7 and -m7 respectively in .end1.
cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea         r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
    jmp         m(idct_4x16_internal_8bpc).pass1

.pass2:
    call        m(iadst_16x4_internal_8bpc).main
    call        m(iadst_16x4_internal_8bpc).main_pass2_end

    punpcklqdq  m6, m5, m4                  ;low: -out5 high: -out7
    punpckhqdq  m4, m5                      ;low: out8 high: out10
    punpcklqdq  m5, m7, m2                  ;low: out4 high: out6
    punpckhqdq  m2, m7                      ;low: -out9 high: -out11
    mova        [coeffq+16*4], m2
    mova        [coeffq+16*5], m6
    mova        m2, [coeffq+16*6]           ; stored by .main
    mova        m6, [coeffq+16*7]           ; stored by .main
    punpckhqdq  m1, m6, m0                  ;low: -out13 high: -out15
    punpcklqdq  m0, m6                      ;low: out0 high: out2
    punpckhqdq  m6, m3, m2                  ;low: out12 high: out14
    punpcklqdq  m2, m3                      ;low: -out1 high: -out3

    mova        m7, [o(pw_2048)]

.end1:
    ; Entered with m7 = scale for m0/m5/m4/m6; m3 = -m7 is applied to the
    ; remaining four vectors (two of them live in coeffq+16*4 / 16*5).
    ; iflipadst_4x16 jumps here with m7 = pw_m2048.
    REPX {pmulhrsw x, m7}, m0, m5, m4, m6
    pxor        m3, m3
    psubw       m3, m7                      ; m3 = -m7
    mova        m7, [coeffq+16*4]
    REPX {pmulhrsw x, m3}, m2, m7, m1
    pmulhrsw    m3, [coeffq+16*5]
    mova        [coeffq+16*7], m5

    punpckhqdq  m5, m4, m7                  ;low: out10 high: out11
    punpcklqdq  m4, m7                      ;low: out8 high: out9
    punpckhqdq  m7, m6, m1                  ;low: out14 high: out15
    punpcklqdq  m6, m1                      ;low: out12 high: out13
    punpckhqdq  m1, m0, m2                  ;low: out2 high: out3
    punpcklqdq  m0, m2                      ;low: out0 high: out1
    mova        [coeffq+16*4], m4
    mova        m4, [coeffq+16*7]
    punpcklqdq  m2, m4, m3                  ;low: out4 high: out5
    punpckhqdq  m4, m3                      ;low: out6 high: out7
    mova        m3, m4

.end2:
    ; same two-step store as idct_4x16 .end1, but with rows already in
    ; natural order (WRITE_4X8 0, 1, 2, 3)
    mova        [coeffq+16*5], m5
    mova        [coeffq+16*6], m6
    mov         r3, coeffq
    WRITE_4X8 0, 1, 2, 3

    mova        m0, [r3+16*4]
    mova        m1, [r3+16*5]
    mova        m2, [r3+16*6]
    mova        m3, m7
    lea         dstq, [dstq+strideq*4]
    WRITE_4X8 0, 1, 2, 3

.end3:
    ; clear the eight 16-byte coefficient vectors
    pxor        m7, m7
    REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    ret


INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

; iflipadst_4x16_internal_8bpc
; Same core as iadst_4x16; the qword halves are picked in the opposite
; order and the scale constant is negated (pw_m2048) before joining
; iadst_4x16 .end1.
cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea         r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
    jmp         m(idct_4x16_internal_8bpc).pass1

.pass2:
    call        m(iadst_16x4_internal_8bpc).main
    call        m(iadst_16x4_internal_8bpc).main_pass2_end

    punpckhqdq  m6, m5, m4                  ;low: out5 high: out7
    punpcklqdq  m4, m5                      ;low: -out8 high: -out10
    punpckhqdq  m5, m7, m2                  ;low: -out4 high: -out6
    punpcklqdq  m2, m7                      ;low: out9 high: out11
    mova        [coeffq+16*4], m2
    mova        [coeffq+16*5], m6
    mova        m2, [coeffq+16*6]
    mova        m6, [coeffq+16*7]
    punpcklqdq  m1, m6, m0                  ;low: out13 high: out15
    punpckhqdq  m0, m6                      ;low: -out0 high: -out2
    punpcklqdq  m6, m3, m2                  ;low: -out12 high: -out14
    punpckhqdq  m2, m3                      ;low: out1 high: out3

    mova        m7, [o(pw_m2048)]
    jmp         m(iadst_4x16_internal_8bpc).end1


INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

; IDTX16: 16-point identity scaling of one register.
;   3 args: m%1 = 2*m%1 + pmulhrsw(m%1, m%3)            (saturating adds)
;   4 args: m%1 = m%1 + pmulhrsw(pmulhrsw(m%1, m%3), m%4)
; m%2 is clobbered as a temporary. m%3 is expected to hold pw_1697x16 and
; the optional m%4 pw_16384 (a rounded >>1).
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
    pmulhrsw    m%2, m%3, m%1
%if %0 == 4 ; if downshifting by 1
    pmulhrsw    m%2, m%4
%else
    paddsw      m%1, m%1
%endif
    paddsw      m%1, m%2
%endmacro

; iidentity_4x16_internal_8bpc
; Pass 1 processes the coefficients in two batches of four vectors (offsets
; 1/3/5/7 first, then 0/2/4/6), looping through .pass1 with tx2q as the
; continuation pointer; the caller's tx2q is kept in r3.
cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*1]
    mova        m6, [o(pw_1697x8)]
    mova        m1, [coeffq+16*3]
    mova        m2, [coeffq+16*5]
    mova        m3, [coeffq+16*7]
    pcmpeqw     m7, m7                      ; m7 = all ones (-1 in every word)
    mov         r3, tx2q
    lea         tx2q, [o(.pass1_2)]
.pass1:
    ; For each of m0-m3: t = pavgw(pmulhrsw(x, pw_1697x8), x); words of x
    ; that equal -1 are then forced to zero (pcmpeqw mask + pandn), all
    ; other words take t.
    pmulhrsw    m4, m6, m0
    pmulhrsw    m5, m6, m1
    pavgw       m4, m0
    pcmpeqw     m0, m7
    pavgw       m5, m1
    pcmpeqw     m1, m7
    pandn       m0, m4
    pmulhrsw    m4, m6, m2
    pandn       m1, m5
    pmulhrsw    m5, m6, m3
    pavgw       m4, m2
    pcmpeqw     m2, m7
    pavgw       m5, m3
    pcmpeqw     m3, m7
    pandn       m2, m4
    pandn       m3, m5
    jmp         m(iadst_4x8_internal_8bpc).pass1_end
.pass1_2:
    ; first batch done: store it, load the second batch and loop once more
    mova        [coeffq+16*1], m0
    mova        [coeffq+16*3], m1
    mova        [coeffq+16*5], m2
    mova        [coeffq+16*7], m3
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*2]
    mova        m2, [coeffq+16*4]
    mova        m3, [coeffq+16*6]
    lea         tx2q, [o(.pass1_end)]
    jmp         .pass1
.pass1_end:
    ; reload the first batch (the eighth vector stays at coeffq+16*7) and
    ; continue at the caller's pass-2 target
    mova        m4, [coeffq+16*1]
    mova        m5, [coeffq+16*3]
    mova        m6, [coeffq+16*5]
    jmp         r3
.pass2:
    ; Only eight registers are available for eight vectors plus constants,
    ; so m6 is parked in coeffq+16*6 while m6/m7 serve as tmp/constant.
    mova        m7, [o(pw_1697x16)]
    mova        [coeffq+16*6], m6
    REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    mova        m6, [coeffq+16*7]
    IDTX16 6, 7, 7                          ; m7 doubles as tmp, constant is lost
    mova        [coeffq+16*7], m6
    mova        m6, [coeffq+16*6]
    pmulhrsw    m7, m6, [o(pw_1697x16)]     ; same scaling, constant from memory
    paddsw      m6, m6
    paddsw      m6, m7
    mova        m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw    m7, [coeffq+16*7]
    mova        [coeffq+16*4], m4
    jmp         m(iadst_4x16_internal_8bpc).end2


; ---------------------------------------------------------------------------
; 16x4 inverse transforms
; ---------------------------------------------------------------------------

; INV_TXFM_16X4_FN: emits the 16x4 entry point through INV_TXFM_FN and, for
; dct_dct only, the DC-only path. .dconly is also entered from other block
; sizes with m0 = partially scaled DC, m1 = pw_2896x8, m2 = pw_16384,
; r2d = loop count and tx2q = return label. Each .dconly_loop iteration adds
; the broadcast DC value to two rows of 16 pixels with unsigned saturation.
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x4, 8
%ifidn %1_%2, dct_dct
    movd        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1, [coeffq]
    movd        m2, [o(pw_16384)]
    mov         [coeffq], eobd              ; overwrite DC coefficient (eobd presumably 0 here - confirm)
    mov         r2d, 2                      ; 2 iterations x 2 rows
    lea         tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
.dconly:
    pmulhrsw    m0, m2
    movd        m2, [o(pw_2048)]            ;intentionally rip-relative
    pmulhrsw    m0, m1
    pmulhrsw    m0, m2
    pshuflw     m0, m0, q0000               ; broadcast the DC word ...
    punpcklwd   m0, m0                      ; ... to all eight lanes
    pxor        m5, m5                      ; zero for byte->word unpacking
.dconly_loop:
    mova        m1, [dstq]
    mova        m3, [dstq+strideq]
    punpckhbw   m2, m1, m5
    punpcklbw   m1, m5
    punpckhbw   m4, m3, m5
    punpcklbw   m3, m5
    paddw       m2, m0
    paddw       m1, m0
    paddw       m4, m0
    paddw       m3, m0
    packuswb    m1, m2
    packuswb    m3, m4
    mova        [dstq], m1
    mova        [dstq+strideq], m3
    lea         dstq, [dstq+strideq*2]
    dec         r2d
    jg          .dconly_loop
    jmp         tx2q
.end:
    RET
%endif
%endmacro

; LOAD_7ROWS: load m0-m6 from %1 + i*%2, i = 0..6.
%macro LOAD_7ROWS 2 ;src, stride
    mova        m0, [%1+%2*0]
    mova        m1, [%1+%2*1]
    mova        m2, [%1+%2*2]
    mova        m3, [%1+%2*3]
    mova        m4, [%1+%2*4]
    mova        m5, [%1+%2*5]
    mova        m6, [%1+%2*6]
%endmacro

; SAVE_7ROWS: store m0-m6 to %1 + i*%2, i = 0..6.
%macro SAVE_7ROWS 2 ;dst, stride
    mova        [%1+%2*0], m0
    mova        [%1+%2*1], m1
    mova        [%1+%2*2], m2
    mova        [%1+%2*3], m3
    mova        [%1+%2*4], m4
    mova        [%1+%2*5], m5
    mova        [%1+%2*6], m6
%endmacro

; IDCT16_1D_PACKED_ODDHALF: odd half of a 16-point IDCT on packed data (two
; values per register, in the low and high qwords).
; In:  m%1-m%4 = odd inputs, paired as shown by the first four comments.
; Out: m%1 = low: t8a  high: t9      m%2 = low: t11 high: t10a
;      m%3 = low: t12  high: t13a    m%4 = low: t15a high: t14
; m%5-m%7 are temporaries; m%7 is loaded with pd_2048 for rounding.
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
    punpckhwd   m%5, m%4, m%1               ;packed in13 in3
    punpcklwd   m%1, m%4                    ;packed in1 in15
    punpcklwd   m%4, m%3, m%2               ;packed in9 in7
    punpckhwd   m%2, m%3                    ;packed in5 in11
    mova        m%7, [o(pd_2048)]
    ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
    ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
    ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
    ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
    psubsw      m%6, m%1, m%4               ;low: t9 high: t14
    paddsw      m%1, m%4                    ;low: t8 high: t15
    psubsw      m%4, m%5, m%2               ;low: t10 high: t13
    paddsw      m%5, m%2                    ;low: t11 high: t12
    mova        m%2, [o(deint_shuf2)]
    pshufb      m%6, m%2
    pshufb      m%4, m%2
    ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
    ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a
    psubsw      m%3, m%1, m%5               ;low: t11a high: t12a
    paddsw      m%1, m%5                    ;low: t8a high: t15a
    psubsw      m%5, m%6, m%4               ;low: t10 high: t13
    paddsw      m%6, m%4                    ;low: t9 high: t14
    pshufb      m%3, m%2
    pshufb      m%5, m%2
    ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
    ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
    packssdw    m%2, m%4                    ;low: t11 high: t10a
    packssdw    m%3, m%5                    ;low: t12 high: t13a
    punpckhqdq  m%4, m%1, m%6               ;low: t15a high: t14
    punpcklqdq  m%1, m%6                    ;low: t8a high: t9
%endmacro

INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

; idct_16x4_internal_8bpc
; Pass 1: m0-m6 are loaded here, the eighth coefficient vector is read from
; [coeffq+16*7] inside .main. coeffq+16*4..16*7 double as scratch space.
; .pass1_end2 / .pass1_end3 and .pass2_end are shared entry points for the
; other 16x4 types.
cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_7ROWS coeffq, 16
    call        .main

.pass1_end:
    punpckhwd   m7, m0, m2                  ;packed out1, out5
    punpcklwd   m0, m2                      ;packed out0, out4
    punpcklwd   m2, m1, m3                  ;packed out3, out7
    punpckhwd   m1, m3                      ;packed out2, out6
    mova        [coeffq+16*6], m7
    mova        m7, [coeffq+16*7]           ; out15/out14, stored by .main
    punpckhwd   m3, m4, m6                  ;packed out9, out13
    punpcklwd   m4, m6                      ;packed out8, out12
    punpcklwd   m6, m5, m7                  ;packed out11, out15
    punpckhwd   m5, m7                      ;packed out10, out14

.pass1_end2:
    ; rounded >>1 of all eight vectors (the eighth lives in coeffq+16*6)
    mova        m7, [o(pw_16384)]
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw    m7, [coeffq+16*6]
    mova        [coeffq+16*6], m7

.pass1_end3:
    ; Word transpose. On exit m0-m3 hold rows 0-3 of columns 0-7, m4-m6
    ; rows 0-2 of columns 8-15, and row 3 of columns 8-15 is stored in
    ; [coeffq+16*7].
    punpckhwd   m7, m3, m6                  ;packed 9, 11, 13, 15 high
    punpcklwd   m3, m6                      ;packed 9, 11, 13, 15 low
    punpckhwd   m6, m4, m5                  ;packed 8, 10, 12, 14 high
    punpcklwd   m4, m5                      ;packed 8, 10, 12, 14 low
    punpckhwd   m5, m4, m3                  ;8, 9, 10, 11, 12, 13, 14, 15(1)
    punpcklwd   m4, m3                      ;8, 9, 10, 11, 12, 13, 14, 15(0)
    punpckhwd   m3, m6, m7                  ;8, 9, 10, 11, 12, 13, 14, 15(3)
    punpcklwd   m6, m7                      ;8, 9, 10, 11, 12, 13, 14, 15(2)
    mova        [coeffq+16*7], m3
    mova        m3, [coeffq+16*6]
    punpckhwd   m7, m3, m2                  ;packed 1, 3, 5, 7 high
    punpcklwd   m3, m2                      ;packed 1, 3, 5, 7 low
    punpckhwd   m2, m0, m1                  ;packed 0, 2, 4, 6 high
    punpcklwd   m0, m1                      ;packed 0, 2, 4, 6 low
    punpckhwd   m1, m0, m3                  ;0, 1, 2, 3, 4, 5, 6, 7(1)
    punpcklwd   m0, m3                      ;0, 1, 2, 3, 4, 5, 6, 7(0)
    punpckhwd   m3, m2, m7                  ;0, 1, 2, 3, 4, 5, 6, 7(3)
    punpcklwd   m2, m7                      ;0, 1, 2, 3, 4, 5, 6, 7(2)
    jmp         tx2q

.pass2:
    lea         tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]

.pass2_end:
    ; Pass 2 is done as two 8x4 halves by the routine in tx2q: first on
    ; m0-m3 (called), then on the vectors saved at coeffq+16*4..16*7 with
    ; dstq advanced by 8 bytes (tail jump, so its ret returns to our caller).
    mova        [coeffq+16*4], m4
    mova        [coeffq+16*5], m5
    mova        [coeffq+16*6], m6
    lea         r3, [dstq+8]
    call        tx2q

    add         coeffq, 16*4
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    mova        m2, [coeffq+16*2]
    mova        m3, [coeffq+16*3]
    mov         dstq, r3
    jmp         tx2q

; .main: packed 16-point IDCT.
; In:  m0-m6 = coefficient vectors 0-6, [coeffq+16*7] = vector 7.
; Out: packed results in m0-m6 as annotated below; the out15/out14 pair is
;      left in [coeffq+16*7]. coeffq+16*4..16*6 are used as scratch.
ALIGN function_align
cglobal_label .main
    punpckhqdq  m7, m0, m1                  ;low:in1 high:in3
    punpcklqdq  m0, m1
    punpcklqdq  m1, m2, m3
    punpckhqdq  m3, m2                      ;low:in7 high:in5
    mova        [coeffq+16*4], m7
    mova        [coeffq+16*5], m3
    mova        m7, [coeffq+16*7]
    punpcklqdq  m2, m4, m5
    punpckhqdq  m4, m5                      ;low:in9 high:in11
    punpcklqdq  m3, m6, m7
    punpckhqdq  m7, m6                      ;low:in15 high:in13
    mova        [coeffq+16*6], m4
    IDCT8_1D_PACKED
    ; swap: bring the odd inputs back in, park the even-half results m1-m3
    mova        m6, [coeffq+16*4]
    mova        m4, [coeffq+16*5]
    mova        m5, [coeffq+16*6]
    mova        [coeffq+16*4], m1
    mova        [coeffq+16*5], m2
    mova        [coeffq+16*6], m3

    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3

    mova        m1, [coeffq+16*4]
    psubsw      m3, m0, m7                  ;low:out15 high:out14
    paddsw      m0, m7                      ;low:out0 high:out1
    psubsw      m7, m1, m5                  ;low:out12 high:out13
    paddsw      m1, m5                      ;low:out3 high:out2
    mova        [coeffq+16*7], m3
    mova        m2, [coeffq+16*5]
    mova        m3, [coeffq+16*6]
    psubsw      m5, m2, m4                  ;low:out11 high:out10
    paddsw      m2, m4                      ;low:out4 high:out5
    psubsw      m4, m3, m6                  ;low:out8 high:out9
    paddsw      m3, m6                      ;low:out7 high:out6
    mova        m6, m7
    ret

INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; iadst_16x4_internal_8bpc
; Pass 1 runs .main + .main_pass1_end, interleaves the packed outputs so
; that positive and negated values end up in separate registers, then
; scales them by +m7 / -m7 in .pass1_end before the shared transpose.
cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_7ROWS coeffq, 16
    call        .main
    call        .main_pass1_end

    punpckhwd   m6, m7, m0                  ;packed -out11, -out15
    punpcklwd   m0, m7                      ;packed out0, out4
    punpcklwd   m7, m3, m4                  ;packed -out3, -out7
    punpckhwd   m4, m3                      ;packed out8, out12
    mova        m1, [coeffq+16*6]
    punpcklwd   m3, m1, m5                  ;packed -out1, -out5
    punpckhwd   m5, m1                      ;packed out10, out14
    mova        m1, [coeffq+16*7]
    mova        [coeffq+16*6], m3
    mova        [coeffq+16*7], m7
    punpckhwd   m3, m2, m1                  ;packed -out9, -out13
    punpcklwd   m1, m2                      ;packed out2, out6

    mova        m7, [o(pw_16384)]

.pass1_end:
    ; Entered with m7 = scale for m0/m1/m4/m5; m2 = -m7 is applied to the
    ; other four vectors. iflipadst_16x4 jumps here with m7 = pw_m16384.
    REPX {pmulhrsw x, m7}, m0, m1, m4, m5
    pxor        m2, m2
    psubw       m2, m7                      ; m2 = -m7
    mova        m7, [coeffq+16*6]
    REPX {pmulhrsw x, m2}, m7, m3, m6
    pmulhrsw    m2, [coeffq+16*7]
    mova        [coeffq+16*6], m7
    jmp         m(idct_16x4_internal_8bpc).pass1_end3

.pass2:
    lea         tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
    jmp         m(idct_16x4_internal_8bpc).pass2_end

; .main: packed 16-point ADST core.
; In:  m0-m6 = coefficient vectors 0-6, [coeffq+16*7] = vector 7.
; Out: m0 = low:out0  high:-out15     m3 = low:-out3 high:out12
;      [coeffq+16*6] = low:-out1 high:out14
;      [coeffq+16*7] = low:out2  high:-out13
;      m1/m2/m4/m5 = intermediate t values (see the last comments) that
;      .main_pass1_end or .main_pass2_end turn into the remaining outputs.
;      m6 is loaded with pd_2048 and not written again in this routine.
; coeffq+16*4..16*7 are used as scratch.
ALIGN function_align
cglobal_label .main
    mova        [coeffq+16*6], m0
    pshufd      m0, m1, q1032
    pshufd      m2, m2, q1032
    punpckhwd   m1, m6, m0                  ;packed in13, in2
    punpcklwd   m0, m6                      ;packed in3, in12
    punpckhwd   m7, m5, m2                  ;packed in11, in4
    punpcklwd   m2, m5                      ;packed in5, in10
    mova        m6, [o(pd_2048)]
    ITX_MUL2X_PACK 1, 5, 6, 995, 3973       ;low:t2 high:t3
    ITX_MUL2X_PACK 7, 5, 6, 1751, 3703      ;low:t4 high:t5
    ITX_MUL2X_PACK 2, 5, 6, 3513, 2106      ;low:t10 high:t11
    ITX_MUL2X_PACK 0, 5, 6, 3857, 1380      ;low:t12 high:t13
    psubsw      m5, m1, m2                  ;low:t10a high:t11a
    paddsw      m1, m2                      ;low:t2a high:t3a
    psubsw      m2, m7, m0                  ;low:t12a high:t13a
    paddsw      m7, m0                      ;low:t4a high:t5a
    punpcklqdq  m0, m5
    punpckhwd   m0, m5                      ;packed t10a, t11a
    punpcklqdq  m5, m2
    punpckhwd   m2, m5                      ;packed t13a, t12a
    ITX_MUL2X_PACK 0, 5, 6, 3406, 2276      ;low:t10 high:t11
    ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1    ;low:t12 high:t13
    mova        [coeffq+16*4], m1
    mova        [coeffq+16*5], m7
    mova        m1, [coeffq+16*6]
    mova        m7, [coeffq+16*7]
    pshufd      m1, m1, q1032
    pshufd      m3, m3, q1032
    punpckhwd   m5, m7, m1                  ;packed in15, in0
    punpcklwd   m1, m7                      ;packed in1, in14
    punpckhwd   m7, m4, m3                  ;packed in9, in6
    punpcklwd   m3, m4                      ;packed in7, in8
    ITX_MUL2X_PACK 5, 4, 6, 201, 4091       ;low:t0 high:t1
    ITX_MUL2X_PACK 7, 4, 6, 2440, 3290      ;low:t6 high:t7
    ITX_MUL2X_PACK 3, 4, 6, 3035, 2751      ;low:t8 high:t9
    ITX_MUL2X_PACK 1, 4, 6, 4052, 601       ;low:t14 high:t15
    psubsw      m4, m5, m3                  ;low:t8a high:t9a
    paddsw      m5, m3                      ;low:t0a high:t1a
    psubsw      m3, m7, m1                  ;low:t14a high:t15a
    paddsw      m7, m1                      ;low:t6a high:t7a
    punpcklqdq  m1, m4
    punpckhwd   m1, m4                      ;packed t8a, t9a
    punpcklqdq  m4, m3
    punpckhwd   m3, m4                      ;packed t15a, t14a
    ITX_MUL2X_PACK 1, 4, 6, 799, 4017       ;low:t8 high:t9
    ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1   ;low:t14 high:t15
    paddsw      m4, m1, m2                  ;low:t12a high:t13a
    psubsw      m1, m2                      ;low:t8a high:t9a
    psubsw      m2, m0, m3                  ;low:t14a high:t15a
    paddsw      m0, m3                      ;low:t10a high:t11a
    punpcklqdq  m3, m1
    punpckhwd   m3, m1                      ;packed t12a, t13a
    punpcklqdq  m1, m2
    punpckhwd   m2, m1                      ;packed t15a, t14a
    ITX_MUL2X_PACK 3, 1, 6, 1567, 3784      ;low:t12 high:t13
    ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1   ;low:t14 high:t15
    psubsw      m1, m3, m2                  ;low:t14a high:t15a
    paddsw      m3, m2                      ;low:out2 high:-out13
    psubsw      m2, m4, m0                  ;low:t10 high:t11
    paddsw      m0, m4                      ;low:-out1 high:out14
    mova        [coeffq+16*6], m0
    mova        [coeffq+16*7], m3
    mova        m0, [coeffq+16*4]
    mova        m3, [coeffq+16*5]
    psubsw      m4, m5, m3                  ;low:t4 high:t5
    paddsw      m5, m3                      ;low:t0 high:t1
    psubsw      m3, m0, m7                  ;low:t6 high:t7
    paddsw      m0, m7                      ;low:t2 high:t3
    punpcklqdq  m7, m4
    punpckhwd   m7, m4                      ;packed t4, t5
    punpcklqdq  m4, m3
    punpckhwd   m3, m4                      ;packed t7, t6
    ITX_MUL2X_PACK 7, 4, 6, 1567, 3784      ;low:t4a high:t5a
    ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1   ;low:t6a high:t7a
    psubsw      m4, m5, m0                  ;low:t2a high:t3a
    paddsw      m0, m5                      ;low:out0 high:-out15
    psubsw      m5, m7, m3                  ;low:t6 high:t7
    paddsw      m3, m7                      ;low:-out3 high:out12
    ret
; .main_pass1_end: final 2896 butterflies for pass 1, done at 32-bit
; precision with pmaddwd, +m6, >>12 and packssdw. m6 is expected to still
; hold pd_2048 from .main. m0 and m3 are preserved through coeffq+16*4/16*5.
ALIGN function_align
.main_pass1_end:
    mova        m7, [o(deint_shuf1)]
    mova        [coeffq+16*4], m0
    mova        [coeffq+16*5], m3
    mova        m0, [o(pw_2896_m2896)]
    mova        m3, [o(pw_2896_2896)]
    pshufb      m1, m7                      ;t14a t15a
    pshufb      m2, m7                      ;t10 t11
    pshufb      m4, m7                      ;t2a t3a
    pshufb      m5, m7                      ;t6 t7
    pmaddwd     m7, m0, m2
    pmaddwd     m2, m3
    paddd       m7, m6
    paddd       m2, m6
    psrad       m7, 12
    psrad       m2, 12
    packssdw    m2, m7                      ;low:out6 high:-out9
    pmaddwd     m7, m0, m4
    pmaddwd     m4, m3
    paddd       m7, m6
    paddd       m4, m6
    psrad       m7, 12
    psrad       m4, 12
    packssdw    m4, m7                      ;low:-out7 high:out8
    pmaddwd     m7, m3, m5
    pmaddwd     m5, m0
    paddd       m7, m6
    paddd       m5, m6
    psrad       m7, 12
    psrad       m5, 12
    packssdw    m7, m5                      ;low:out4 high:-out11
    pmaddwd     m5, m3, m1
    pmaddwd     m1, m0
    paddd       m5, m6
    paddd       m1, m6
    psrad       m5, 12
    psrad       m1, 12
    packssdw    m5, m1                      ;low:-out5 high:out10
    mova        m0, [coeffq+16*4]
    mova        m3, [coeffq+16*5]
    ret
; .main_pass2_end: same final butterflies for pass 2, done with 16-bit
; saturating add/sub followed by pmulhrsw with pw_2896x8. Produces the
; same register layout as .main_pass1_end (m2, m4, m5, m7).
ALIGN function_align
cglobal_label .main_pass2_end
    mova        m7, [o(pw_2896x8)]
    punpckhqdq  m6, m2, m1                  ;low:t11 high:t15a
    punpcklqdq  m2, m1                      ;low:t10 high:t14a
    psubsw      m1, m2, m6
    paddsw      m2, m6
    punpckhqdq  m6, m4, m5                  ;low:t3a high:t7
    punpcklqdq  m4, m5                      ;low:t2a high:t6
    psubsw      m5, m4, m6
    paddsw      m4, m6
    pmulhrsw    m1, m7                      ;low:-out9 high:out10
    pmulhrsw    m2, m7                      ;low:out6 high:-out5
    pmulhrsw    m5, m7                      ;low:out8 high:-out11
    pmulhrsw    m4, m7                      ;low:-out7 high:out4
    punpckhqdq  m7, m4, m5                  ;low:out4 high:-out11
    punpcklqdq  m4, m5                      ;low:-out7 high:out8
    punpckhqdq  m5, m2, m1                  ;low:-out5 high:out10
    punpcklqdq  m2, m1                      ;low:out6 high:-out9
    ret


INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; iflipadst_16x4_internal_8bpc
; Same core as iadst_16x4; the word interleaves pick the opposite halves
; and the scale constant is negated (pw_m16384) before joining
; iadst_16x4 .pass1_end.
cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_7ROWS coeffq, 16
    call        m(iadst_16x4_internal_8bpc).main
    call        m(iadst_16x4_internal_8bpc).main_pass1_end

    punpcklwd   m6, m7, m0                  ;packed out11, out15
    punpckhwd   m0, m7                      ;packed -out0, -out4
    punpckhwd   m7, m3, m4                  ;packed out3, out7
    punpcklwd   m4, m3                      ;packed -out8, -out12
    mova        m1, [coeffq+16*6]
    punpckhwd   m3, m1, m5                  ;packed out1, out5
    punpcklwd   m5, m1                      ;packed -out10, -out14
    mova        m1, [coeffq+16*7]
    mova        [coeffq+16*6], m3
    mova        [coeffq+16*7], m7
    punpcklwd   m3, m2, m1                  ;packed out9, out13
    punpckhwd   m1, m2                      ;packed -out2, -out6

    mova        m7, [o(pw_m16384)]
    jmp         m(iadst_16x4_internal_8bpc).pass1_end

.pass2:
    lea         tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
    jmp         m(idct_16x4_internal_8bpc).pass2_end


INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

; iidentity_16x4_internal_8bpc
; Pass 1 scales each of the eight coefficient vectors as
;     x = x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)   (saturating add)
; Only eight registers are available, so the vectors are handled in groups
; (5/6/7, then 2/3/4, then 0/1) with coeffq+16*5..16*7 holding the first
; group in the meantime. The results are then word-interleaved into the
; layout expected by idct_16x4 .pass1_end3, which performs the transpose.
cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m1, [coeffq+16*6]
    mova        m0, [coeffq+16*5]
    mova        m2, [coeffq+16*7]
    mova        m6, [o(pw_1697x16)]
    mova        m7, [o(pw_16384)]
    pmulhrsw    m4, m6, m1
    pmulhrsw    m3, m6, m0
    pmulhrsw    m5, m6, m2
    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    pmulhrsw    m5, m7
    paddsw      m1, m4
    paddsw      m0, m3
    paddsw      m5, m2                      ; vector 7 result ends up in m5
    mova        m2, [coeffq+16*2]
    mova        m3, [coeffq+16*3]
    mova        m4, [coeffq+16*4]
    mova        [coeffq+16*6], m1
    mova        [coeffq+16*5], m0
    mova        [coeffq+16*7], m5
    pmulhrsw    m0, m6, m2
    pmulhrsw    m1, m6, m3
    pmulhrsw    m5, m6, m4
    pmulhrsw    m0, m7
    pmulhrsw    m1, m7
    pmulhrsw    m5, m7
    paddsw      m2, m0
    paddsw      m3, m1
    paddsw      m4, m5
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    pmulhrsw    m5, m6, m0
    pmulhrsw    m6, m1                      ; last use of the constant in m6
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    paddsw      m0, m5
    paddsw      m1, m6
    mova        m6, [coeffq+16*6]
    mova        m5, [coeffq+16*5]
    punpckhwd   m7, m0, m2                  ;packed out1, out5
    punpcklwd   m0, m2                      ;packed out0, out4
    punpckhwd   m2, m1, m3                  ;packed out3, out7
    punpcklwd   m1, m3                      ;packed out2, out6
    mova        [coeffq+16*6], m7           ; .pass1_end3 reloads this slot
    mova        m7, [coeffq+16*7]
    punpckhwd   m3, m4, m6                  ;packed out9, out13
    punpcklwd   m4, m6                      ;packed out8, out12
    punpckhwd   m6, m5, m7                  ;packed out11, out15
    punpcklwd   m5, m7                      ;packed out10, out14
    jmp         m(idct_16x4_internal_8bpc).pass1_end3

.pass2:
    ; pass 2 = two 8x4 identity halves, driven by idct_16x4 .pass2_end
    lea         tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
    jmp         m(idct_16x4_internal_8bpc).pass2_end


; SAVE_8ROWS: store m0-m7 to %1 + i*%2, i = 0..7.
%macro SAVE_8ROWS 2 ;dst, stride
    mova        [%1+%2*0], m0
    mova        [%1+%2*1], m1
    mova        [%1+%2*2], m2
    mova        [%1+%2*3], m3
    mova        [%1+%2*4], m4
    mova        [%1+%2*5], m5
    mova        [%1+%2*6], m6
    mova        [%1+%2*7], m7
%endmacro
2197 2198%macro INV_TXFM_8X16_FN 2 ; type1, type2 2199 INV_TXFM_FN %1, %2, 8x16, 8, 16*16 2200%ifidn %1_%2, dct_dct 2201 pshuflw m0, [coeffq], q0000 2202 punpcklwd m0, m0 2203 mova m1, [o(pw_2896x8)] 2204 pmulhrsw m0, m1 2205 mova m2, [o(pw_16384)] 2206 mov [coeffq], eobd 2207 pmulhrsw m0, m1 2208 pmulhrsw m0, m2 2209 psrlw m2, 3 ; pw_2048 2210 pmulhrsw m0, m1 2211 pmulhrsw m0, m2 2212 mov r3d, 4 2213 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] 2214 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop 2215.end: 2216 RET 2217%endif 2218%endmacro 2219 2220INV_TXFM_8X16_FN dct, dct 2221INV_TXFM_8X16_FN dct, adst 2222INV_TXFM_8X16_FN dct, flipadst 2223INV_TXFM_8X16_FN dct, identity 2224 2225cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2226 lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] 2227 2228.pass1: 2229 LOAD_8ROWS coeffq+16*1, 32, 1 2230 mov [rsp+gprsize+16*11], tx2q 2231 lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] 2232 jmp r3 2233 2234.pass1_end: 2235 SAVE_8ROWS coeffq+16*1, 32 2236 LOAD_8ROWS coeffq+16*0, 32, 1 2237 mov tx2q, [rsp+gprsize+16*11] 2238 jmp r3 2239 2240.pass2: 2241 lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] 2242 2243.pass2_pre: 2244 mova [coeffq+16*2 ], m1 2245 mova [coeffq+16*6 ], m3 2246 mova [coeffq+16*10], m5 2247 mova [coeffq+16*14], m7 2248 mova m1, m2 2249 mova m2, m4 2250 mova m3, m6 2251 mova m4, [coeffq+16*1 ] 2252 mova m5, [coeffq+16*5 ] 2253 mova m6, [coeffq+16*9 ] 2254 mova m7, [coeffq+16*13] 2255 2256.pass2_main: 2257 call m(idct_8x8_internal_8bpc).main 2258 2259 SAVE_7ROWS rsp+gprsize+16*3, 16 2260 mova m0, [coeffq+16*2 ] 2261 mova m1, [coeffq+16*6 ] 2262 mova m2, [coeffq+16*10] 2263 mova m3, [coeffq+16*14] 2264 mova m4, [coeffq+16*3 ] 2265 mova m5, [coeffq+16*7 ] 2266 mova m6, [coeffq+16*11] 2267 mova m7, [coeffq+16*15] 2268 call m(idct_16x8_internal_8bpc).main 2269 2270 mov r3, dstq 2271 lea dstq, [dstq+strideq*8] 2272 jmp m(idct_8x8_internal_8bpc).end 2273 2274.end: 2275 LOAD_8ROWS rsp+gprsize+16*3, 
16 2276 mova [rsp+gprsize+16*0], m7 2277 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] 2278 mov dstq, r3 2279 jmp m(idct_8x8_internal_8bpc).end 2280 2281.end1: 2282 pxor m7, m7 2283 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 2284 ret 2285 2286INV_TXFM_8X16_FN adst, dct 2287INV_TXFM_8X16_FN adst, adst 2288INV_TXFM_8X16_FN adst, flipadst 2289INV_TXFM_8X16_FN adst, identity 2290 2291cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2292 lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] 2293 jmp m(idct_8x16_internal_8bpc).pass1 2294 2295.pass2: 2296 lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] 2297 2298.pass2_pre: 2299 mova [rsp+gprsize+16*7], m0 2300 mova [rsp+gprsize+16*8], m1 2301 mova [rsp+gprsize+16*5], m6 2302 mova [rsp+gprsize+16*6], m7 2303 mova m0, m2 2304 mova m1, m3 2305 mova m2, m4 2306 mova m3, m5 2307 2308.pass2_main: 2309 mova m4, [coeffq+16*1 ] 2310 mova m5, [coeffq+16*3 ] 2311 mova m6, [coeffq+16*13] 2312 mova m7, [coeffq+16*15] 2313 mova [rsp+gprsize+16*3], m4 2314 mova [rsp+gprsize+16*4], m5 2315 mova [rsp+gprsize+16*9], m6 2316 mova [rsp+gprsize+32*5], m7 2317 mova m4, [coeffq+16*5 ] 2318 mova m5, [coeffq+16*7 ] 2319 mova m6, [coeffq+16*9 ] 2320 mova m7, [coeffq+16*11] 2321 2322 call m(iadst_16x8_internal_8bpc).main 2323 call m(iadst_16x8_internal_8bpc).main_pass2_end 2324 2325 mov r3, dstq 2326 lea dstq, [dstq+strideq*8] 2327 jmp m(iadst_8x8_internal_8bpc).end 2328 2329.end: 2330 LOAD_8ROWS rsp+gprsize+16*3, 16 2331 mova [rsp+gprsize+16*0], m7 2332 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] 2333 mov dstq, r3 2334 jmp m(iadst_8x8_internal_8bpc).end 2335 2336 2337INV_TXFM_8X16_FN flipadst, dct 2338INV_TXFM_8X16_FN flipadst, adst 2339INV_TXFM_8X16_FN flipadst, flipadst 2340INV_TXFM_8X16_FN flipadst, identity 2341 2342cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 2343 lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] 2344 jmp m(idct_8x16_internal_8bpc).pass1 
; ---------------------------------------------------------------------------
; Continuation of iflipadst_8x16_internal_8bpc (its entry point and pass 1
; precede this part of the file).
; ---------------------------------------------------------------------------
.pass2:
    lea             tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
    lea             r3, [dstq+strideq*8]

; Alternate entry used by the 16x16 flipadst pass 2: the caller has already
; set tx2q (and r3). Spills m0/m1/m6/m7 to stack slots 7/8/5/6 and moves
; m2..m5 down into m0..m3.
.pass2_pre:
    mova            [rsp+gprsize+16*7], m0
    mova            [rsp+gprsize+16*8], m1
    mova            [rsp+gprsize+16*5], m6
    mova            [rsp+gprsize+16*6], m7
    mova            m0, m2
    mova            m1, m3
    mova            m2, m4
    mova            m3, m5

; Loads the odd 16-byte coefficient rows: 1/3/13/15 go to stack slots
; 3/4/9/10 (32*5 == 16*10), 5/7/9/11 stay in m4..m7. Then runs the shared
; 16-point ADST core and its pass-2 tail.
.pass2_main:
    mova            m4, [coeffq+16*1 ]
    mova            m5, [coeffq+16*3 ]
    mova            m6, [coeffq+16*13]
    mova            m7, [coeffq+16*15]
    mova            [rsp+gprsize+16*3], m4
    mova            [rsp+gprsize+16*4], m5
    mova            [rsp+gprsize+16*9], m6
    mova            [rsp+gprsize+32*5], m7
    mova            m4, [coeffq+16*5 ]
    mova            m5, [coeffq+16*7 ]
    mova            m6, [coeffq+16*9 ]
    mova            m7, [coeffq+16*11]

    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass2_end
    jmp m(iflipadst_8x8_internal_8bpc).end

; Second group of 8 rows: reload from stack slots 3.., retarget dstq to the
; address saved in r3 and finish through idct_8x16's .end1.
.end:
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov             dstq, r3
    jmp m(iflipadst_8x8_internal_8bpc).end


INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; ---------------------------------------------------------------------------
; 8x16 identity transform.
; Pass 1 handles the coefficient rows in two groups of eight (16-byte rows at
; coeffq+16*1 and coeffq+16*0, stride 32). Each group is sent through
; idct_8x8's .pass1_end3 tail, which returns through tx2q. The real pass-2
; target is kept in r3 while tx2q points at .pass1_end.
; LOAD_8ROWS / SAVE_8ROWS / IDTX16 are macros defined elsewhere in this file.
; ---------------------------------------------------------------------------
cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS      coeffq+16*1, 32, 1
    mov             r3, tx2q
    lea             tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)]
    mova            [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

.pass1_end:
    SAVE_8ROWS      coeffq+16*1, 32
    LOAD_8ROWS      coeffq+16*0, 32, 1
    mov             tx2q, r3                ; restore the real pass-2 entry
    mova            [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

.pass2:
    lea             tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)]

; Scales all eight registers with IDTX16 (constant pw_1697x16 in m7), then
; multiplies by pw_2048 with pmulhrsw. m6/m7 are spilled first because IDTX16
; needs a temporary and a constant register. Results for the registers that
; do not fit are left in stack slots 0..2 for idct_8x8's .end3.
.end:
    mova            [rsp+gprsize+16*0], m7
    mova            [rsp+gprsize+16*1], m6
    mova            m7, [o(pw_1697x16)]
    REPX            {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    mova            m6, [rsp+gprsize+16*1]
    mova            [rsp+gprsize+16*2], m5
    IDTX16          6, 5, 7
    mova            m5, [rsp+gprsize+16*0]
    IDTX16          5, 7, 7
    mova            m7, [o(pw_2048)]
    REPX            {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw        m7, [rsp+gprsize+16*2]
    mova            [rsp+gprsize+16*0], m5
    mova            [rsp+gprsize+16*1], m6
    mova            [rsp+gprsize+16*2], m7
    jmp m(idct_8x8_internal_8bpc).end3

; Second group of rows: reload, advance dstq by two strides and run .end
; again, this time finishing through idct_8x16's .end1.
.end1:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea             dstq, [dstq+strideq*2]
    jmp .end


; ---------------------------------------------------------------------------
; INV_TXFM_16X8_FN type1, type2
; Instantiates the 16x8 entry point for one transform-type pair through
; INV_TXFM_FN (defined elsewhere). For dct_dct it also emits the code that
; follows: the value at [coeffq] is multiplied by pw_2896x8 twice (pmulhrsw),
; eobd is stored over it, m2 = pw_16384 and r2d = 4 are set up, and control
; goes to the 16x4 .dconly code, which comes back through tx2q (.end -> RET).
; NOTE(review): when this path is taken is decided inside INV_TXFM_FN;
; presumably eob == 0 (DC only) — confirm there.
; ---------------------------------------------------------------------------
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN     %1, %2, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
    movd            m1, [o(pw_2896x8)]
    pmulhrsw        m0, m1, [coeffq]
    movd            m2, [o(pw_16384)]
    mov             [coeffq], eobd
    pmulhrsw        m0, m1
    mov             r2d, 4
    lea             tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.end:
    RET
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
INV_TXFM_16X8_FN dct, identity

; ---------------------------------------------------------------------------
; 16x8 inverse DCT.
; Pass 1: the rows at coeffq+16*0 (stride 32) go through the 8-point DCT core
; and seven results are parked in stack slots 3..9; the rows at coeffq+16*1
; then go through .main below, which combines both halves into 16 outputs.
; The two 8-row halves are finished one after the other through idct_8x8's
; .pass1_end, with tx2q temporarily pointing at .pass1_end here.
; Pass 2: two 8x8 column transforms; r3 = dstq+8 is the destination of the
; second one.
; ---------------------------------------------------------------------------
cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS      coeffq+16*0, 32, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS      rsp+gprsize+16*3, 16

    LOAD_8ROWS      coeffq+16*1, 32, 1
    call .main
    mov             r3, tx2q
    lea             tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS      coeffq+16*1, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    mov             tx2q, r3                ; restore the real pass-2 entry
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass2:
    lea             tx2q, [o(m(idct_16x8_internal_8bpc).end)]
    lea             r3, [dstq+8]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov             dstq, r3
    jmp m(idct_8x8_internal_8bpc).pass2_main


; Odd half of the 16-point DCT.
; In:  m0..m7 = the eight odd inputs; stack slots 0 and 3..9 hold values left
;      by the preceding 8-point stage (they are read here as the partners
;      for out0..out7 / out8..out15).
; Out: per the inline notes, out0..out6 in stack slots 3..9, out7 in slot 10
;      (32*5), out8 in m0, out9 in m1, out10..out14 in m2..m6, out15 in m7
;      and in slot 0.
; Stack slots are addressed with gprsize*2 here where callers use gprsize for
; the same slot; presumably this compensates for the return address pushed by
; the call that enters .main — confirm against x86inc's gprsize.
ALIGN function_align
cglobal_label .main
    mova            [rsp+gprsize*2+16*1], m2
    mova            [rsp+gprsize*2+16*2], m6
    mova            [rsp+gprsize*2+32*5], m5

    mova            m6, [o(pd_2048)]
    ITX_MULSUB_2W   0, 7, 2, 5, 6, 401, 4076     ;t8a, t15a
    ITX_MULSUB_2W   4, 3, 2, 5, 6, 3166, 2598    ;t9a, t14a
    psubsw          m2, m0, m4                   ;t9
    paddsw          m0, m4                       ;t8
    psubsw          m4, m7, m3                   ;t14
    paddsw          m7, m3                       ;t15
    ITX_MULSUB_2W   4, 2, 3, 5, 6, 1567, 3784    ;t9a, t14a
    mova            m3, [rsp+gprsize*2+16*1]
    mova            m5, [rsp+gprsize*2+32*5]
    mova            [rsp+gprsize*2+16*1], m2
    mova            [rsp+gprsize*2+32*5], m4
    mova            m2, [rsp+gprsize*2+16*2]
    mova            [rsp+gprsize*2+16*2], m7
    ITX_MULSUB_2W   3, 5, 7, 4, 6, 1931, 3612    ;t10a, t13a
    ITX_MULSUB_2W   2, 1, 7, 4, 6, 3920, 1189    ;t11a, t12a
    psubsw          m4, m2, m3                   ;t10
    paddsw          m2, m3                       ;t11
    psubsw          m3, m1, m5                   ;t13
    paddsw          m1, m5                       ;t12
    ITX_MULSUB_2W   3, 4, 7, 5, 6, m3784, 1567   ;t10a, t13a
    mova            m7, [rsp+gprsize*2+32*5]
    psubsw          m6, m0, m2                   ;t11a
    paddsw          m0, m2                       ;t8a
    paddsw          m2, m7, m3                   ;t9
    psubsw          m7, m3                       ;t10
    mova            m5, [rsp+gprsize*2+16*0]
    psubsw          m3, m5, m0                   ;out8
    paddsw          m0, m5                       ;out7
    mova            [rsp+gprsize*2+32*5], m0
    mova            m5, [rsp+gprsize*2+16*9]
    psubsw          m0, m5, m2                   ;out9
    paddsw          m2, m5                       ;out6
    mova            [rsp+gprsize*2+16*0], m0
    mova            [rsp+gprsize*2+16*9], m2
    mova            m0, [rsp+gprsize*2+16*1]
    mova            m2, [rsp+gprsize*2+16*2]
    mova            [rsp+gprsize*2+16*1], m3
    psubsw          m5, m0, m4                   ;t13
    paddsw          m0, m4                       ;t14
    mova            m3, [o(pd_2048)]
    psubsw          m4, m2, m1                   ;t12a
    paddsw          m1, m2                       ;t15a
    mova            [rsp+gprsize*2+16*2], m1
    ITX_MULSUB_2W   5, 7, 1, 2, 3, 2896, 2896    ;t10a, t13a
    ITX_MULSUB_2W   4, 6, 1, 2, 3, 2896, 2896    ;t11, t12
    mova            m3, [rsp+gprsize*2+16*8]
    psubsw          m2, m3, m5                   ;out10
    paddsw          m3, m5                       ;out5
    mova            m5, [rsp+gprsize*2+16*7]
    mova            [rsp+gprsize*2+16*8], m3
    psubsw          m3, m5, m4                   ;out11
    paddsw          m5, m4                       ;out4
    mova            m4, [rsp+gprsize*2+16*6]
    mova            [rsp+gprsize*2+16*7], m5
    paddsw          m5, m4, m6                   ;out3
    psubsw          m4, m6                       ;out12
    mova            m6, [rsp+gprsize*2+16*5]
    mova            [rsp+gprsize*2+16*6], m5
    psubsw          m5, m6, m7                   ;out13
    paddsw          m6, m7                       ;out2
    mova            m7, [rsp+gprsize*2+16*4]
    mova            [rsp+gprsize*2+16*5], m6
    psubsw          m6, m7, m0                   ;out14
    paddsw          m7, m0                       ;out1
    mova            m1, [rsp+gprsize*2+16*2]
    mova            m0, [rsp+gprsize*2+16*3]
    mova            [rsp+gprsize*2+16*4], m7
    psubsw          m7, m0, m1                   ;out15
    paddsw          m0, m1                       ;out0
    mova            [rsp+gprsize*2+16*3], m0
    mova            m1, [rsp+gprsize*2+16*0]
    mova            m0, [rsp+gprsize*2+16*1]
    mova            [rsp+gprsize*2+16*0], m7
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; ---------------------------------------------------------------------------
; 16x8 inverse ADST.
; Pass 1 multiplies all sixteen 16-byte coefficient rows by pw_2896x8
; (pmulhrsw) while distributing them: rows 0/1/14/15 to stack slots 7..10,
; rows 8/9/6/7 to slots 3..6, rows 2..5 and 10..13 to m0..m7. That is the
; layout .main expects (see the ;inN notes inside .main).
; ---------------------------------------------------------------------------
cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova            m7, [o(pw_2896x8)]
    pmulhrsw        m0, m7, [coeffq+16*0 ]
    pmulhrsw        m1, m7, [coeffq+16*1 ]
    pmulhrsw        m2, m7, [coeffq+16*14]
    pmulhrsw        m3, m7, [coeffq+16*15]
    mova            [rsp+gprsize+16*7], m0
    mova            [rsp+gprsize+16*8], m1
    mova            [rsp+gprsize+16*9], m2
    mova            [rsp+gprsize+32*5], m3
    pmulhrsw        m0, m7, [coeffq+16*6 ]
    pmulhrsw        m1, m7, [coeffq+16*7 ]
    pmulhrsw        m2, m7, [coeffq+16*8 ]
    pmulhrsw        m3, m7, [coeffq+16*9 ]
    mova            [rsp+gprsize+16*3], m2
    mova            [rsp+gprsize+16*4], m3
    mova            [rsp+gprsize+16*5], m0
    mova            [rsp+gprsize+16*6], m1
    pmulhrsw        m0, m7, [coeffq+16*2 ]
    pmulhrsw        m1, m7, [coeffq+16*3 ]
    pmulhrsw        m2, m7, [coeffq+16*4 ]
    pmulhrsw        m3, m7, [coeffq+16*5 ]
    pmulhrsw        m4, m7, [coeffq+16*10]
    pmulhrsw        m5, m7, [coeffq+16*11]
    pmulhrsw        m6, m7, [coeffq+16*12]
    pmulhrsw        m7, [coeffq+16*13]

    call .main
    call .main_pass1_end
    mov             r3, tx2q
    lea             tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS      coeffq+16*1, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    mov             tx2q, r3                ; restore the real pass-2 entry
    jmp m(iadst_8x8_internal_8bpc).pass1_end

.pass2:
    lea             tx2q, [o(m(iadst_16x8_internal_8bpc).end)]
    lea             r3, [dstq+8]
    jmp m(iadst_8x8_internal_8bpc).pass2_main

.end:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov             dstq, r3
    jmp m(iadst_8x8_internal_8bpc).pass2_main

; 16-point ADST butterfly network, shared by the 16x8, 8x16 and 16x16
; adst/flipadst routines in this part of the file.
; In:  sixteen inputs split between m0..m7 and stack slots 3..10; the ;inN
;      notes on the loads below name the slot each one is read from.
; Out: partial results in registers and stack slots 0..14 as tagged by the
;      ;outN / ;-outN notes. The remaining outputs are produced by
;      .main_pass1_end or .main_pass2_end, which must be called next.
; Slots are addressed with gprsize*2 because this code is entered via call
; (see the note on idct_16x8's .main).
ALIGN function_align
cglobal_label .main
    mova            [rsp+gprsize*2+16*0], m1
    mova            [rsp+gprsize*2+16*1], m2
    mova            [rsp+gprsize*2+16*2], m6

    mova            m6, [o(pd_2048)]
    ITX_MULSUB_2W   7, 0, 1, 2, 6, 995, 3973     ;t3, t2
    ITX_MULSUB_2W   3, 4, 1, 2, 6, 3513, 2106    ;t11, t10
    psubsw          m1, m0, m4                   ;t10a
    paddsw          m0, m4                       ;t2a
    psubsw          m4, m7, m3                   ;t11a
    paddsw          m3, m7                       ;t3a
    ITX_MULSUB_2W   1, 4, 7, 2, 6, 3406, 2276    ;t11, t10
    mova            m2, [rsp+gprsize*2+16*0]     ;in3
    mova            m7, [rsp+gprsize*2+16*1]     ;in4
    mova            [rsp+gprsize*2+16*0], m1     ;t11
    mova            [rsp+gprsize*2+16*1], m4     ;t10
    mova            m1, [rsp+gprsize*2+16*2]     ;in12
    mova            [rsp+gprsize*2+16*2], m0     ;t2a
    ITX_MULSUB_2W   5, 7, 0, 4, 6, 1751, 3703    ;t5, t4
    ITX_MULSUB_2W   2, 1, 0, 4, 6, 3857, 1380    ;t13, t12
    psubsw          m0, m7, m1                   ;t12a
    paddsw          m1, m7                       ;t4a
    psubsw          m4, m5, m2                   ;t13a
    paddsw          m5, m2                       ;t5a
    ITX_MULSUB_2W   4, 0, 7, 2, 6, 4017, 799     ;t12, t13
    mova            m2, [rsp+gprsize*2+16*8]     ;in1
    mova            m7, [rsp+gprsize*2+16*9]     ;in14
    mova            [rsp+gprsize*2+16*8], m4     ;t12
    mova            [rsp+gprsize*2+16*9], m0     ;t13
    mova            m4, [rsp+gprsize*2+16*4]     ;in9
    mova            m0, [rsp+gprsize*2+16*5]     ;in6
    mova            [rsp+gprsize*2+16*4], m1     ;t4a
    mova            [rsp+gprsize*2+16*5], m5     ;t5a
    ITX_MULSUB_2W   2, 7, 1, 5, 6, 4052, 601     ;t15, t14
    ITX_MULSUB_2W   4, 0, 1, 5, 6, 2440, 3290    ;t7, t6
    psubsw          m1, m0, m7                   ;t14a
    paddsw          m0, m7                       ;t6a
    psubsw          m5, m4, m2                   ;t15a
    paddsw          m4, m2                       ;t7a
    ITX_MULSUB_2W   5, 1, 7, 2, 6, 2276, 3406    ;t14, t15
    mova            m2, [rsp+gprsize*2+16*2]     ;t2a
    mova            [rsp+gprsize*2+16*2], m5     ;t14
    psubsw          m7, m2, m0                   ;t6
    paddsw          m2, m0                       ;t2
    psubsw          m0, m3, m4                   ;t7
    paddsw          m3, m4                       ;t3
    ITX_MULSUB_2W   0, 7, 4, 5, 6, 3784, 1567    ;t6a, t7a
    mova            m4, [rsp+gprsize*2+16*7]     ;in0
    mova            m5, [rsp+gprsize*2+32*5]     ;in15
    mova            [rsp+gprsize*2+16*7], m3     ;t3
    mova            [rsp+gprsize*2+32*5], m1     ;t15
    mova            m1, [rsp+gprsize*2+16*6]     ;in7
    mova            m3, [rsp+gprsize*2+16*3]     ;in8
    mova            [rsp+gprsize*2+16*6], m7     ;t7a
    mova            [rsp+gprsize*2+16*3], m0     ;t6a
    ITX_MULSUB_2W   5, 4, 0, 7, 6, 201, 4091     ;t1, t0
    ITX_MULSUB_2W   1, 3, 0, 7, 6, 3035, 2751    ;t9, t8
    psubsw          m0, m4, m3                   ;t8a
    paddsw          m4, m3                       ;t0a
    psubsw          m3, m5, m1                   ;t9a
    paddsw          m5, m1                       ;t1a
    ITX_MULSUB_2W   0, 3, 1, 7, 6, 799, 4017     ;t9, t8
    mova            m1, [rsp+gprsize*2+16*4]     ;t4a
    mova            m7, [rsp+gprsize*2+16*5]     ;t5a
    mova            [rsp+gprsize*2+16*4], m3     ;t8
    mova            [rsp+gprsize*2+16*5], m0     ;t9
    psubsw          m0, m4, m1                   ;t4
    paddsw          m4, m1                       ;t0
    psubsw          m3, m5, m7                   ;t5
    paddsw          m5, m7                       ;t1
    ITX_MULSUB_2W   0, 3, 1, 7, 6, 1567, 3784    ;t5a, t4a
    mova            m7, [rsp+gprsize*2+16*3]     ;t6a
    psubsw          m1, m4, m2                   ;t2a
    paddsw          m4, m2                       ;out0
    mova            [rsp+gprsize*2+16*3], m4     ;out0
    mova            m4, [rsp+gprsize*2+16*6]     ;t7a
    psubsw          m2, m3, m7                   ;t6
    paddsw          m3, m7                       ;-out3
    mova            [rsp+gprsize*2+16*6], m3     ;-out3
    psubsw          m3, m0, m4                   ;t7
    paddsw          m0, m4                       ;out12
    mova            [rsp+gprsize*2+16*12], m3
    mova            m3, [rsp+gprsize*2+16*7]     ;t3
    mova            [rsp+gprsize*2+16* 7], m2    ;out4
    psubsw          m2, m5, m3                   ;t3a
    paddsw          m5, m3                       ;-out15
    mova            [rsp+gprsize*2+16*11], m2
    mova            m2, [rsp+gprsize*2+32*5]     ;t15
    mova            [rsp+gprsize*2+16*10], m1    ;-out7
    mova            m1, [rsp+gprsize*2+16*0]     ;t11
    mova            [rsp+gprsize*2+16*0 ], m5    ;-out15
    mova            m3, [rsp+gprsize*2+16*1]     ;t10
    mova            [rsp+gprsize*2+16*1 ], m4    ;-out11
    mova            m4, [rsp+gprsize*2+16*2]     ;t14
    mova            [rsp+gprsize*2+16*2 ], m0    ;out12
    psubsw          m0, m3, m4                   ;t14a
    paddsw          m3, m4                       ;t10a
    psubsw          m5, m1, m2                   ;t15a
    paddsw          m1, m2                       ;t11a
    ITX_MULSUB_2W   5, 0, 2, 4, 6, 3784, 1567    ;t14, t15
    mova            m2, [rsp+gprsize*2+16*4]     ;t8
    mova            m4, [rsp+gprsize*2+16*5]     ;t9
    mova            [rsp+gprsize*2+16*4], m3     ;t10a
    mova            [rsp+gprsize*2+16*5], m1     ;t11a
    mova            m3, [rsp+gprsize*2+16*8]     ;t12
    mova            m1, [rsp+gprsize*2+16*9]     ;t13
    mova            [rsp+gprsize*2+16*8], m5     ;t14
    mova            [rsp+gprsize*2+16*9], m0     ;t15
    psubsw          m5, m2, m3                   ;t12a
    paddsw          m2, m3                       ;t8a
    psubsw          m0, m4, m1                   ;t13a
    paddsw          m4, m1                       ;t9a
    ITX_MULSUB_2W   5, 0, 1, 3, 6, 1567, 3784    ;t13, t12
    mova            m6, [rsp+gprsize*2+16*4]     ;t10a
    mova            m1, [rsp+gprsize*2+16*5]     ;t11a
    psubsw          m3, m2, m6                   ;t10
    paddsw          m2, m6                       ;-out1
    paddsw          m6, m4, m1                   ;out14
    psubsw          m4, m1                       ;t11
    mova            [rsp+gprsize*2+16*14], m4
    mova            [rsp+gprsize*2+16* 4], m2    ;-out1
    mova            m4, [rsp+gprsize*2+16*8]     ;t14
    mova            m2, [rsp+gprsize*2+16*9]     ;t15
    mova            [rsp+gprsize*2+16* 9], m3    ;out6
    psubsw          m3, m0, m4                   ;t14a
    paddsw          m0, m4                       ;out2
    psubsw          m4, m5, m2                   ;t15a
    paddsw          m5, m2                       ;-out13
    mova            [rsp+gprsize*2+16* 5], m0    ;out2
    ret

; Final rotation stage for pass 1. Each sum/difference pair left by .main is
; interleaved (punpck{l,h}wd), multiplied against pw_2896_2896 /
; pw_2896_m2896 with pmaddwd, rounded with pd_2048, shifted right by 12 and
; packed back to words.
; Leaves out8 in m0, -out9 in m1, out10 in m2, -out11 in m3, out12 in m4,
; -out13 in m5, out14 in m6; -out5, out6, out4 and -out7 go to stack slots
; 8, 9, 7 and 10.
ALIGN function_align
.main_pass1_end:
    mova            m0, [rsp+gprsize*2+16*14]
    mova            [rsp+gprsize*2+16*14], m5
    mova            [rsp+gprsize*2+16*15], m6
    mova            m5, [o(pw_2896_2896)]
    mova            m6, [o(pw_2896_m2896)]
    mova            m7, [o(pd_2048)]
    punpcklwd       m2, m3, m4
    punpckhwd       m3, m4
    pmaddwd         m4, m5, m2
    pmaddwd         m2, m6
    pmaddwd         m1, m5, m3
    pmaddwd         m3, m6
    REPX            {paddd x, m7}, m4, m2, m1, m3
    REPX            {psrad x, 12}, m4, m1, m2, m3
    packssdw        m4, m1                       ;-out5
    packssdw        m2, m3                       ;out10
    mova            [rsp+gprsize*2+16* 8], m4
    mova            m3, [rsp+gprsize*2+16* 9]
    punpcklwd       m1, m3, m0
    punpckhwd       m3, m0
    pmaddwd         m0, m5, m1
    pmaddwd         m1, m6
    pmaddwd         m4, m5, m3
    pmaddwd         m3, m6
    REPX            {paddd x, m7}, m0, m1, m4, m3
    REPX            {psrad x, 12}, m0, m4, m1, m3
    packssdw        m0, m4                       ;out6
    packssdw        m1, m3                       ;-out9
    mova            [rsp+gprsize*2+16* 9], m0
    mova            m0, [rsp+gprsize*2+16* 7]
    mova            m4, [rsp+gprsize*2+16*12]
    punpcklwd       m3, m0, m4
    punpckhwd       m0, m4
    pmaddwd         m4, m5, m3
    pmaddwd         m3, m6
    pmaddwd         m5, m0
    pmaddwd         m0, m6
    REPX            {paddd x, m7}, m4, m3, m5, m0
    REPX            {psrad x, 12}, m4, m5, m3, m0
    packssdw        m4, m5                       ;out4
    packssdw        m3, m0                       ;-out11
    mova            [rsp+gprsize*2+16* 7], m4
    mova            m4, [rsp+gprsize*2+16*10]
    mova            m5, [rsp+gprsize*2+16*11]
    punpcklwd       m0, m4, m5
    punpckhwd       m4, m5
    ; m5 was overwritten above, so pw_2896_2896 is read from memory here
    pmaddwd         m5, m0, [o(pw_2896_2896)]
    pmaddwd         m0, m6
    pmaddwd         m6, m4
    pmaddwd         m4, [o(pw_2896_2896)]
    REPX            {paddd x, m7}, m5, m0, m6, m4
    REPX            {psrad x, 12}, m0, m6, m5, m4
    packssdw        m0, m6                       ;out8
    packssdw        m5, m4                       ;-out7
    mova            [rsp+gprsize*2+16*10], m5
    mova            m4, [rsp+gprsize*2+16* 2]    ;out12
    mova            m5, [rsp+gprsize*2+16*14]    ;-out13
    mova            m6, [rsp+gprsize*2+16*15]    ;out14
    ret

; Final stage for pass 2. Same output pairs as .main_pass1_end, but computed
; with saturating 16-bit add/sub followed by pmulhrsw against pw_2896x8
; instead of the 32-bit pmaddwd sequence. Unlike .main_pass1_end it does not
; touch m5/m6.
ALIGN function_align
cglobal_label .main_pass2_end
    mova            m7, [o(pw_2896x8)]
    mova            m1, [rsp+gprsize*2+16* 9]
    mova            m2, [rsp+gprsize*2+16*14]
    paddsw          m0, m1, m2
    psubsw          m1, m2
    pmulhrsw        m0, m7                       ;out6
    pmulhrsw        m1, m7                       ;-out9
    mova            [rsp+gprsize*2+16* 9], m0
    psubsw          m2, m3, m4
    paddsw          m3, m4
    pmulhrsw        m2, m7                       ;out10
    pmulhrsw        m3, m7                       ;-out5
    mova            [rsp+gprsize*2+16* 8], m3
    mova            m3, [rsp+gprsize*2+16* 7]
    mova            m4, [rsp+gprsize*2+16*12]
    paddsw          m0, m3, m4
    psubsw          m3, m4
    pmulhrsw        m0, m7                       ;out4
    pmulhrsw        m3, m7                       ;-out11
    mova            [rsp+gprsize*2+16* 7], m0
    mova            m0, [rsp+gprsize*2+16*10]
    paddsw          m4, m0, [rsp+gprsize*2+16*11]
    psubsw          m0, [rsp+gprsize*2+16*11]
    pmulhrsw        m4, m7                       ;-out7
    pmulhrsw        m0, m7                       ;out8
    mova            [rsp+gprsize*2+16*10], m4
    mova            m4, [rsp+gprsize*2+16*2 ]    ;out12
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; ---------------------------------------------------------------------------
; 16x8 inverse flipped ADST.
; Loads and pre-scales the coefficients exactly as iadst_16x8 does and reuses
; its .main / .main_pass1_end; the difference is in where the two 8-row
; halves are stored and in using iflipadst_8x8's tails.
; ---------------------------------------------------------------------------
cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova            m7, [o(pw_2896x8)]
    pmulhrsw        m0, m7, [coeffq+16*0 ]
    pmulhrsw        m1, m7, [coeffq+16*1 ]
    pmulhrsw        m2, m7, [coeffq+16*14]
    pmulhrsw        m3, m7, [coeffq+16*15]
    mova            [rsp+gprsize+16*7], m0
    mova            [rsp+gprsize+16*8], m1
    mova            [rsp+gprsize+16*9], m2
    mova            [rsp+gprsize+32*5], m3
    pmulhrsw        m0, m7, [coeffq+16*6 ]
    pmulhrsw        m1, m7, [coeffq+16*7 ]
    pmulhrsw        m2, m7, [coeffq+16*8 ]
    pmulhrsw        m3, m7, [coeffq+16*9 ]
    mova            [rsp+gprsize+16*3], m2
    mova            [rsp+gprsize+16*4], m3
    mova            [rsp+gprsize+16*5], m0
    mova            [rsp+gprsize+16*6], m1
    pmulhrsw        m0, m7, [coeffq+16*2 ]
    pmulhrsw        m1, m7, [coeffq+16*3 ]
    pmulhrsw        m2, m7, [coeffq+16*4 ]
    pmulhrsw        m3, m7, [coeffq+16*5 ]
    pmulhrsw        m4, m7, [coeffq+16*10]
    pmulhrsw        m5, m7, [coeffq+16*11]
    pmulhrsw        m6, m7, [coeffq+16*12]
    pmulhrsw        m7, [coeffq+16*13]

    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    ; Park the register half in the even coefficient rows, then process the
    ; half held in stack slots 3.. first.
    mova            m7, [rsp+gprsize+16*0]
    SAVE_8ROWS      coeffq+16*0, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    mov             r3, tx2q
    lea             tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS      coeffq+16*1, 32
    LOAD_8ROWS      coeffq+16*0, 32
    mova            [rsp+gprsize+16*0], m7
    mov             tx2q, r3                ; restore the real pass-2 entry
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end

.pass2:
    lea             tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)]
    lea             r3, [dstq+8]
    jmp m(iflipadst_8x8_internal_8bpc).pass2_main

.end:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov             dstq, r3
    jmp m(iflipadst_8x8_internal_8bpc).pass2_main


INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; ---------------------------------------------------------------------------
; 16x8 identity transform.
; coeffq is moved forward by 16*16 on entry and back by 8*16 in every .pass1
; iteration, so the two iterations address the two halves of the coefficient
; buffer with the same offsets. Each value is first multiplied by pw_2896x8,
; then x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384).
; ---------------------------------------------------------------------------
cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    add             coeffq, 16*16
    mova            m4, [coeffq-16*7]
    mova            m5, [coeffq-16*5]
    mova            m6, [coeffq-16*3]
    mova            m7, [coeffq-16*1]
    mov             r3, tx2q
    lea             tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)]

.pass1:
    mova            m0, [o(pw_2896x8)]
    mova            m2, [o(pw_1697x16)]
    mova            m3, [o(pw_16384)]
    sub             coeffq, 8*16
    REPX            {pmulhrsw x, m0}, m4, m5, m6, m7
    pmulhrsw        m1, m2, m4
    pmulhrsw        m1, m3
    paddsw          m1, m4                       ; 1
    pmulhrsw        m4, m2, m5
    pmulhrsw        m4, m3
    paddsw          m4, m5                       ; 3
    pmulhrsw        m5, m2, m6
    pmulhrsw        m5, m3
    paddsw          m5, m6                       ; 5
    pmulhrsw        m6, m2, m7
    pmulhrsw        m6, m3
    paddsw          m7, m6                       ; 7
    pmulhrsw        m6, m0, [coeffq+16*6]
    mova            [rsp+gprsize+16*0], m4
    pmulhrsw        m4, m2, m6
    pmulhrsw        m4, m3
    paddsw          m6, m4                       ; 6
    pmulhrsw        m4, m0, [coeffq+16*4]
    mova            [rsp+gprsize+16*1], m6
    pmulhrsw        m6, m2, m4
    pmulhrsw        m6, m3
    paddsw          m4, m6                       ; 4
    pmulhrsw        m6, m0, [coeffq+16*2]
    pmulhrsw        m0, [coeffq+16*0]
    pmulhrsw        m2, m6
    pmulhrsw        m2, m3
    paddsw          m2, m6                       ; 2
    ; m2 no longer holds pw_1697x16, so it is read from memory for row 0
    pmulhrsw        m6, m0, [o(pw_1697x16)]
    pmulhrsw        m6, m3
    mova            m3, [rsp+gprsize+16*0]
    paddsw          m0, m6
    jmp m(idct_8x8_internal_8bpc).pass1_end3

; First half done: store it, pick up the four rows saved at coeffq-16*{7,5,3,1}
; for the second half, and loop back into .pass1 with the real tx2q restored.
.pass1_end:
    mova            [coeffq+16*1], m4
    mova            [coeffq+16*3], m5
    mova            [coeffq+16*5], m6
    mova            [coeffq+16*7], m7
    mova            m4, [coeffq-16*7]
    mova            m5, [coeffq-16*5]
    mova            m6, [coeffq-16*3]
    mova            m7, [coeffq-16*1]
    mova            [coeffq-16*7], m0
    mova            [coeffq-16*5], m1
    mova            [coeffq-16*3], m2
    mova            [coeffq-16*1], m3
    mov             tx2q, r3
    jmp .pass1

.pass2:
    lea             tx2q, [o(m(iidentity_16x8_internal_8bpc).end)]
    lea             r3, [dstq+8]
    jmp m(iidentity_8x8_internal_8bpc).end

.end:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov             dstq, r3
    jmp m(iidentity_8x8_internal_8bpc).end


; ---------------------------------------------------------------------------
; INV_TXFM_16X16_FN type1, type2
; Same scheme as INV_TXFM_16X8_FN for the 16x16 size. The dct_dct extra code
; differs in three visible ways: m2 = pw_8192, r2d = 8, and there is no
; second pmulhrsw by pw_2896x8 before the jump to the 16x4 .dconly code.
; ---------------------------------------------------------------------------
%macro INV_TXFM_16X16_FN 2 ; type1, type2
    INV_TXFM_FN     %1, %2, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
    movd            m1, [o(pw_2896x8)]
    pmulhrsw        m0, m1, [coeffq]
    movd            m2, [o(pw_8192)]
    mov             [coeffq], eobd
    mov             r2d, 8
    lea             tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.end:
    RET
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
INV_TXFM_16X16_FN dct, identity

; ---------------------------------------------------------------------------
; 16x16 inverse DCT.
; Pass 1 runs the 16-point DCT twice, first on the coefficient rows at
; coeffq+16*{1,3} (stride 64) and then on those at coeffq+16*{0,2}. Every
; 8-row half is finished through idct_8x8's .pass1_end1 with m7 = pw_8192,
; chained via tx2q: .pass1_end -> .pass1_end1 -> .pass1_end2 -> real pass 2.
; Pass 2 reuses the idct_8x16 column code for the left and right 8 columns.
; ---------------------------------------------------------------------------
cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS      coeffq+16*1, 64
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS      rsp+gprsize+16*3, 16
    LOAD_8ROWS      coeffq+16*3, 64
    call m(idct_16x8_internal_8bpc).main
    mov             r3, tx2q
    lea             tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)]
    mova            m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS      coeffq+16*17, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)]
    mova            m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS      coeffq+16*1, 32
    LOAD_8ROWS      coeffq+16*0, 64
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS      rsp+gprsize+16*3, 16
    LOAD_8ROWS      coeffq+16*2, 64
    call m(idct_16x8_internal_8bpc).main
    lea             tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)]
    mova            m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS      coeffq+16*16, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    mov             tx2q, r3                ; restore the real pass-2 entry
    mova            m7, [o(pw_8192)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass2:
    lea             tx2q, [o(m(idct_16x16_internal_8bpc).end)]
    jmp m(idct_8x16_internal_8bpc).pass2_pre

.end:
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(idct_16x16_internal_8bpc).end1)]
    mov             dstq, r3
    lea             r3, [dstq+8]
    jmp m(idct_8x8_internal_8bpc).end

; Left half done: zero the first sixteen 16-byte coefficient rows, step
; coeffq to the second half and run the 8x16 pass 2 again at dstq = r3.
.end1:
    pxor            m7, m7
    REPX            {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add             coeffq, 32*8
    mov             dstq, r3

    mova            m0, [coeffq+16*0 ]
    mova            m1, [coeffq+16*4 ]
    mova            m2, [coeffq+16*8 ]
    mova            m3, [coeffq+16*12]
    mova            m4, [coeffq+16*1 ]
    mova            m5, [coeffq+16*5 ]
    mova            m6, [coeffq+16*9 ]
    mova            m7, [coeffq+16*13]
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end)]
    jmp m(idct_8x16_internal_8bpc).pass2_main


; Loads the odd-numbered 16-byte coefficient rows (1, 3, ..., 31) into the
; register / stack-slot layout used by iadst_16x8's .main: rows 1/3/29/31 to
; slots 7..10, rows 17/19/13/15 to slots 3..6, rows 5..11 and 21..27 to
; m0..m7. No scaling is applied.
%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
    mova            m0, [coeffq+16*1 ]
    mova            m1, [coeffq+16*3 ]
    mova            m2, [coeffq+16*29]
    mova            m3, [coeffq+16*31]
    mova            [rsp+gprsize+16*7], m0
    mova            [rsp+gprsize+16*8], m1
    mova            [rsp+gprsize+16*9], m2
    mova            [rsp+gprsize+32*5], m3
    mova            m0, [coeffq+16*13]
    mova            m1, [coeffq+16*15]
    mova            m2, [coeffq+16*17]
    mova            m3, [coeffq+16*19]
    mova            [rsp+gprsize+16*3], m2
    mova            [rsp+gprsize+16*4], m3
    mova            [rsp+gprsize+16*5], m0
    mova            [rsp+gprsize+16*6], m1
    mova            m0, [coeffq+16*5 ]
    mova            m1, [coeffq+16*7 ]
    mova            m2, [coeffq+16*9 ]
    mova            m3, [coeffq+16*11]
    mova            m4, [coeffq+16*21]
    mova            m5, [coeffq+16*23]
    mova            m6, [coeffq+16*25]
    mova            m7, [coeffq+16*27]
%endmacro

; Same layout as ITX_16X16_ADST_LOAD_ODD_COEFS, for the even-numbered rows
; (0, 2, ..., 30).
%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
    mova            m0, [coeffq+16*0 ]
    mova            m1, [coeffq+16*2 ]
    mova            m2, [coeffq+16*28]
    mova            m3, [coeffq+16*30]
    mova            [rsp+gprsize+16*7], m0
    mova            [rsp+gprsize+16*8], m1
    mova            [rsp+gprsize+16*9], m2
    mova            [rsp+gprsize+32*5], m3
    mova            m0, [coeffq+16*12]
    mova            m1, [coeffq+16*14]
    mova            m2, [coeffq+16*16]
    mova            m3, [coeffq+16*18]
    mova            [rsp+gprsize+16*3], m2
    mova            [rsp+gprsize+16*4], m3
    mova            [rsp+gprsize+16*5], m0
    mova            [rsp+gprsize+16*6], m1
    mova            m0, [coeffq+16*4 ]
    mova            m1, [coeffq+16*6 ]
    mova            m2, [coeffq+16*8 ]
    mova            m3, [coeffq+16*10]
    mova            m4, [coeffq+16*20]
    mova            m5, [coeffq+16*22]
    mova            m6, [coeffq+16*24]
    mova            m7, [coeffq+16*26]
%endmacro

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

; ---------------------------------------------------------------------------
; 16x16 inverse ADST.
; Pass 1 runs iadst_16x8's .main / .main_pass1_end on the odd rows, then on
; the even rows. Each 8-row half is finished through iadst_8x8's .pass1_end1
; with m7 = pw_8192, chained via tx2q in the same way as idct_16x16.
; Pass 2 reuses the iadst_8x16 column code for the left and right 8 columns.
; ---------------------------------------------------------------------------
cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ITX_16X16_ADST_LOAD_ODD_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    mov             r3, tx2q
    lea             tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)]
    mova            m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS      coeffq+16*17, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)]
    mova            m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS      coeffq+16*1, 32
    ITX_16X16_ADST_LOAD_EVEN_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    lea             tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)]
    mova            m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS      coeffq+16*16, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    mov             tx2q, r3                ; restore the real pass-2 entry
    mova            m7, [o(pw_8192)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end1

.pass2:
    lea             tx2q, [o(m(iadst_16x16_internal_8bpc).end)]
    jmp m(iadst_8x16_internal_8bpc).pass2_pre

.end:
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(iadst_16x16_internal_8bpc).end1)]
    mov             dstq, r3
    lea             r3, [dstq+8]
    jmp m(iadst_8x8_internal_8bpc).end

; Left half done: zero the first sixteen coefficient rows, step coeffq to the
; second half and set up the register / stack layout that iadst_8x16's
; .pass2_main expects (the same one .pass2_pre produces).
.end1:
    pxor            m7, m7
    REPX            {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add             coeffq, 32*8
    mov             dstq, r3

    mova            m4, [coeffq+16*0 ]
    mova            m5, [coeffq+16*2 ]
    mova            m0, [coeffq+16*4 ]
    mova            m1, [coeffq+16*6 ]
    mova            m2, [coeffq+16*8 ]
    mova            m3, [coeffq+16*10]
    mova            m6, [coeffq+16*12]
    mova            m7, [coeffq+16*14]
    mova            [rsp+gprsize+16*7], m4
    mova            [rsp+gprsize+16*8], m5
    mova            [rsp+gprsize+16*5], m6
    mova            [rsp+gprsize+16*6], m7
    lea             tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
    jmp m(iadst_8x16_internal_8bpc).pass2_main


INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; ---------------------------------------------------------------------------
; 16x16 inverse flipped ADST.
; Same structure as iadst_16x16 with three visible differences: the halves
; are finished through iflipadst_8x8's .pass1_end1 with m7 = pw_m8192, the
; destination rows of the two halves within each pass are swapped
; (coeffq+16*1 / +16*17), and the even-row register half is parked at
; coeffq+16*0 before the stack half is processed.
; ---------------------------------------------------------------------------
cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ITX_16X16_ADST_LOAD_ODD_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    mov             r3, tx2q
    lea             tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)]
    mova            m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS      coeffq+16*1, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)]
    mova            m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS      coeffq+16*17, 32
    ITX_16X16_ADST_LOAD_EVEN_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    mova            m7, [rsp+gprsize+16*0]
    SAVE_8ROWS      coeffq+16*0, 32
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)]
    mova            m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS      coeffq+16*16, 32
    LOAD_8ROWS      coeffq+16* 0, 32
    mova            [rsp+gprsize+16*0], m7
    mov             tx2q, r3                ; restore the real pass-2 entry
    mova            m7, [o(pw_m8192)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1

.pass2:
    lea             tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)]
    lea             r3, [dstq+8]
    jmp m(iflipadst_8x16_internal_8bpc).pass2_pre

.end:
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)]
    lea             dstq, [dstq+strideq*2]
    jmp m(iflipadst_8x8_internal_8bpc).end

; Left 8 columns done: zero the first sixteen coefficient rows and repeat the
; 8x16 pass 2 on the second half, writing at dstq = r3 (set in .pass2).
.end1:
    pxor            m7, m7
    REPX            {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add             coeffq, 32*8

    mova            m4, [coeffq+16*0 ]
    mova            m5, [coeffq+16*2 ]
    mova            m0, [coeffq+16*4 ]
    mova            m1, [coeffq+16*6 ]
    mova            m2, [coeffq+16*8 ]
    mova            m3, [coeffq+16*10]
    mova            m6, [coeffq+16*12]
    mova            m7, [coeffq+16*14]
    mova            [rsp+gprsize+16*7], m4
    mova            [rsp+gprsize+16*8], m5
    mova            [rsp+gprsize+16*5], m6
    mova            [rsp+gprsize+16*6], m7

    lea             tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)]
    mov             dstq, r3
    jmp m(iflipadst_8x16_internal_8bpc).pass2_main

.end2:
    LOAD_8ROWS      rsp+gprsize+16*3, 16
    mova            [rsp+gprsize+16*0], m7
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea             dstq, [dstq+strideq*2]
    jmp m(iflipadst_8x8_internal_8bpc).end


; IDTX16B src/dst, tmp, pw_1697x16
; m%1 = pavgw(m%1, pmulhrsw(m%1, m%3) >> 1); m%2 is clobbered as scratch.
; Passing the same register as tmp and constant (as done for the last row in
; .pass1 below) destroys the constant.
%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
    pmulhrsw        m%2, m%3, m%1
    psraw           m%2, 1
    pavgw           m%1, m%2
%endmacro

INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity

; ---------------------------------------------------------------------------
; 16x16 identity transform.
; Pass 1 runs the .pass1 body four times, each time on eight 16-byte rows at
; stride 32 from the current coeffq. coeffq starts at +16*17 and is stepped
; by -16, -15*16 and -16 between iterations, i.e. bases 17, 16, 1 and 0.
; tx2q selects the continuation after each iteration; the real pass-2 entry
; is held in r3 until the last one.
; ---------------------------------------------------------------------------
cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    add             coeffq, 16*17
    mov             r3, tx2q
    lea             tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)]

.pass1:
    mova            m6, [o(pw_1697x16)]
    mova            m7, [coeffq+32*6]
    mova            m0, [coeffq+32*0]
    mova            m1, [coeffq+32*1]
    mova            m2, [coeffq+32*2]
    mova            m3, [coeffq+32*3]
    mova            m4, [coeffq+32*4]
    REPX            {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
    mova            m5, [coeffq+32*5]
    mova            [rsp+gprsize+16*1], m7       ; row 6 result goes to the stack
    IDTX16B         5, 7, 6
    mova            m7, [coeffq+32*7]
    IDTX16B         7, 6, 6                      ; last use: m6 doubles as tmp
    jmp m(idct_8x8_internal_8bpc).pass1_end3

.pass1_end:
    SAVE_8ROWS      coeffq, 32
    sub             coeffq, 16
    lea             tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)]
    jmp .pass1

.pass1_end1:
    SAVE_8ROWS      coeffq, 32
    sub             coeffq, 15*16
    lea             tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)]
    jmp .pass1

.pass1_end2:
    SAVE_8ROWS      coeffq, 32
    sub             coeffq, 16
    mov             tx2q, r3                ; restore the real pass-2 entry
    jmp .pass1

.pass2:
    lea             r3, [dstq+8]
    lea             tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)]

; Scales eight registers with IDTX16 (constant pw_1697x16 in m7) and then by
; pw_2048 (pmulhrsw), spilling through stack slots 0..2, and hands over to
; idct_8x8's .end3. Executed four times in total:
; .pass2 -> .end1 -> .end2 -> .end3.
.end:
    mova            [rsp+gprsize+16*0], m7
    mova            [rsp+gprsize+16*1], m4
    mova            m7, [o(pw_1697x16)]
    REPX            {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
    mova            m4, [o(pw_2048)]
    pmulhrsw        m5, m4
    pmulhrsw        m6, m4
    mova            [rsp+gprsize+16*2], m5
    mova            m5, [rsp+gprsize+16*1]
    mova            [rsp+gprsize+16*1], m6
    IDTX16          5, 6, 7
    mova            m6, [rsp+gprsize+16*0]
    IDTX16          6, 7, 7
    REPX            {pmulhrsw x, m4}, m0, m1, m2, m3, m6
    pmulhrsw        m4, m5
    mova            [rsp+gprsize+16*0], m6
    jmp m(idct_8x8_internal_8bpc).end3

.end1:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)]
    lea             dstq, [dstq+strideq*2]
    jmp .end

; First 8 columns done: zero the first sixteen coefficient rows, step coeffq
; to the second half and continue at dstq = r3 (dstq+8 from .pass2).
.end2:
    pxor            m7, m7
    REPX            {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add             coeffq, 32*8
    LOAD_8ROWS      coeffq, 32
    lea             tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)]
    mov             dstq, r3
    jmp .end

.end3:
    LOAD_8ROWS      coeffq+16*1, 32
    lea             tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea             dstq, [dstq+strideq*2]
    jmp .end


; ---------------------------------------------------------------------------
; Public 8x32 DCT_DCT entry point (declared with 16*36 bytes of stack).
; eob == 0 takes the .dconly path; otherwise idct_8x32_internal_8bpc does the
; work. On x86-32, r5 is loaded with $$ first — presumably the base used by
; the o() addressing macro; confirm against its definition.
; ---------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA             r5, $$
%endif
    test            eobd, eobd
    jz .dconly
    call m(idct_8x32_internal_8bpc)
    RET

; DC-only: the value at [coeffq] is multiplied (pmulhrsw) by pw_2896x8,
; pw_8192, pw_2896x8 again and pw_2048, then broadcast. eobd (zero on this
; path) is stored over the coefficient. The 8x8 add loop is entered with
; r3d = 8 and returns through tx2q to .end.
.dconly:
    movd            m1, [o(pw_2896x8)]
    pmulhrsw        m0, m1, [coeffq]
    movd            m2, [o(pw_8192)]
    mov             [coeffq], eobd
    pmulhrsw        m0, m2
    psrlw           m2, 2                        ;pw_2048
    pmulhrsw        m0, m1
    pmulhrsw        m0, m2
    pshuflw         m0, m0, q0000
    punpcklwd       m0, m0
    mov             r3d, 8
    lea             tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop

.end:
    RET



; ---------------------------------------------------------------------------
; 8x32 inverse DCT worker (only partly contained in this portion of the file:
; .main2 and .main, referenced below, are defined further down).
; Pass 1 runs four 8x8 DCTs over the coefficient rows at coeffq+16*{3,2,1,0}
; (stride 64) and scatters the results into stack slots 5..34 as tagged by
; the ;inN notes. When eob <= 106 the first two of those are skipped (.fast)
; and the cheaper .main_fast path is used for the odd half.
; ---------------------------------------------------------------------------
cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp
    cmp             eobd, 106
    jle .fast

    LOAD_8ROWS      coeffq+16*3, 64
    call m(idct_8x8_internal_8bpc).main
    mova            m7, [o(pw_8192)]
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).pass1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1:
    mova            [rsp+gprsize+16*9 ], m0      ;in24
    mova            [rsp+gprsize+16*10], m4      ;in28
    mova            [rsp+gprsize+16*17], m2      ;in26
    mova            [rsp+gprsize+16*18], m6      ;in30
    mova            [rsp+gprsize+16*31], m1      ;in25
    mova            [rsp+gprsize+16*30], m3      ;in27
    mova            [rsp+gprsize+16*27], m5      ;in29
    mova            [rsp+gprsize+16*34], m7      ;in31
    LOAD_8ROWS      coeffq+16*2, 64
    call m(idct_8x8_internal_8bpc).main
    mova            m7, [o(pw_8192)]
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_1:
    mova            [rsp+gprsize+16*7 ], m0      ;in16
    mova            [rsp+gprsize+16*8 ], m4      ;in20
    mova            [rsp+gprsize+16*15], m2      ;in18
    mova            [rsp+gprsize+16*16], m6      ;in22
    mova            [rsp+gprsize+16*33], m1      ;in17
    mova            [rsp+gprsize+16*28], m3      ;in19
    mova            [rsp+gprsize+16*29], m5      ;in21
    mova            [rsp+gprsize+16*32], m7      ;in23

.fast:
    LOAD_8ROWS      coeffq+16*1, 64
    call m(idct_8x8_internal_8bpc).main
    mova            m7, [o(pw_8192)]
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    mova            [rsp+gprsize+16*5 ], m0      ;in8
    mova            [rsp+gprsize+16*6 ], m4      ;in12
    mova            [rsp+gprsize+16*13], m2      ;in10
    mova            [rsp+gprsize+16*14], m6      ;in14
    mova            [rsp+gprsize+16*21], m1      ;in9
    mova            [rsp+gprsize+16*24], m3      ;in11
    mova            [rsp+gprsize+16*25], m5      ;in13
    mova            [rsp+gprsize+16*20], m7      ;in15
    LOAD_8ROWS      coeffq+16*0, 64
    call m(idct_8x8_internal_8bpc).main
    mova            m7, [o(pw_8192)]
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    mova            [rsp+gprsize+16*11], m2      ;in2
    mova            [rsp+gprsize+16*12], m6      ;in6
    mova            [rsp+gprsize+16*19], m1      ;in1
    mova            [rsp+gprsize+16*26], m3      ;in3
    mova            [rsp+gprsize+16*23], m5      ;in5
    mova            [rsp+gprsize+16*22], m7      ;in7
    mova            m1, m4                       ;in4
    mova            m2, [rsp+gprsize+16*5 ]      ;in8
    mova            m3, [rsp+gprsize+16*6 ]      ;in12

    cmp             eobd, 106
    jg .full

    ; eob <= 106: in16..in31 were never produced, so the upper inputs of the
    ; 8-point and 16-point stages are fed zeros.
    pxor            m4, m4
    REPX            {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS      rsp+gprsize+16*3 , 16
    mova            m0, [rsp+gprsize+16*11]
    mova            m1, [rsp+gprsize+16*12]
    mova            m2, [rsp+gprsize+16*13]
    mova            m3, [rsp+gprsize+16*14]
    pxor            m4, m4
    REPX            {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova            m7, [rsp+gprsize+16*0]
    SAVE_8ROWS      rsp+gprsize+16*11, 16

    call .main_fast
    jmp .pass2

.full:
    mova            m4, [rsp+gprsize+16*7 ]      ;in16
    mova            m5, [rsp+gprsize+16*8 ]      ;in20
    mova            m6, [rsp+gprsize+16*9 ]      ;in24
    mova            m7, [rsp+gprsize+16*10]      ;in28
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS      rsp+gprsize+16*3 , 16
    LOAD_8ROWS      rsp+gprsize+16*11, 16
    call m(idct_16x8_internal_8bpc).main
    mova            m7, [rsp+gprsize+16*0]
    SAVE_8ROWS      rsp+gprsize+16*11, 16
    call .main

; Write-out: r3 holds the final continuation (.end6 -> ret). The chain
; .end2 .. .end5 sends four groups of eight rows (registers, then stack slots
; 11.., 19.., 27..) through idct_8x8's .end, advancing dstq by two strides
; between groups.
.pass2:
    lea             r3, [o(m(idct_8x32_internal_8bpc).end6)]

.end:
    mova            [rsp+gprsize+16*0 ], m7
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).end2)]

; Zeroes 32 consecutive 16-byte coefficient rows, then continues at tx2q.
.end1:
    pxor            m7, m7
    REPX            {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
                                              8,  9,  10, 11, 12, 13, 14, 15, \
                                              16, 17, 18, 19, 20, 21, 22, 23, \
                                              24, 25, 26, 27, 28, 29, 30, 31

    jmp tx2q

.end2:
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).end3)]
    jmp m(idct_8x8_internal_8bpc).end

.end3:
    LOAD_8ROWS      rsp+gprsize+16*11, 16
    mova            [rsp+gprsize+16*0 ], m7
    lea             dstq, [dstq+strideq*2]
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).end4)]
    jmp m(idct_8x8_internal_8bpc).end

.end4:
    LOAD_8ROWS      rsp+gprsize+16*19, 16
    mova            [rsp+gprsize+16*0 ], m7
    lea             dstq, [dstq+strideq*2]
    lea             tx2q, [o(m(idct_8x32_internal_8bpc).end5)]
    jmp m(idct_8x8_internal_8bpc).end

.end5:
    LOAD_8ROWS      rsp+gprsize+16*27, 16
    mova            [rsp+gprsize+16*0 ], m7
    lea             dstq, [dstq+strideq*2]
    mov             tx2q, r3
    jmp m(idct_8x8_internal_8bpc).end

.end6:
    ret

; Odd-half setup reading only in1, in7, in5 and in3 (stack slots 19, 22, 23,
; 26). Since no partner inputs are added, each pmulhrsw pair directly yields
; both members of a t-pair (see the "t30,t31" style notes). Continues in
; .main2, which is defined after this portion of the file.
ALIGN function_align
.main_veryfast:
    mova            m0, [rsp+gprsize*2+16*19]    ;in1
    pmulhrsw        m3, m0, [o(pw_4091x8)]       ;t30,t31
    pmulhrsw        m0, [o(pw_201x8)]            ;t16,t17
    mova            m7, [o(pd_2048)]
    mova            [rsp+gprsize*2+16*19], m0    ;t16
    mova            [rsp+gprsize*2+16*34], m3    ;t31
    ITX_MULSUB_2W   3, 0, 1, 2, 7, 799, 4017     ;t17a, t30a
    mova            [rsp+gprsize*2+16*20], m3    ;t17a
    mova            [rsp+gprsize*2+16*33], m0    ;t30a
    mova            m1, [rsp+gprsize*2+16*22]    ;in7
    pmulhrsw        m2, m1, [o(pw_3857x8)]       ;t28,t29
    pmulhrsw        m1, [o(pw_m1380x8)]          ;t18,t19
    mova            [rsp+gprsize*2+16*22], m1    ;t19
    mova            [rsp+gprsize*2+16*31], m2    ;t28
    ITX_MULSUB_2W   2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a
    mova            [rsp+gprsize*2+16*21], m2    ;t18a
    mova            [rsp+gprsize*2+16*32], m1    ;t29a
    mova            m0, [rsp+gprsize*2+16*23]    ;in5
    pmulhrsw        m3, m0, [o(pw_3973x8)]       ;t26, t27
    pmulhrsw        m0, [o(pw_995x8)]            ;t20, t21
    mova            [rsp+gprsize*2+16*23], m0    ;t20
    mova            [rsp+gprsize*2+16*30], m3    ;t27
    ITX_MULSUB_2W   3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a
    mova            [rsp+gprsize*2+16*24], m3    ;t21a
    mova            [rsp+gprsize*2+16*29], m0    ;t26a
    mova            m2, [rsp+gprsize*2+16*26]    ;in3
    pxor            m0, m0
    mova            m3, m0
    pmulhrsw        m1, m2, [o(pw_4052x8)]
    pmulhrsw        m2, [o(pw_m601x8)]
    jmp .main2

; Odd-half setup for the case where in17..in31 are zero: reads in1..in15
; (odd) from stack slots 19..26 and, like .main_veryfast, continues in .main2.
ALIGN function_align
.main_fast: ;bottom half is zero
    mova            m0, [rsp+gprsize*2+16*19]    ;in1
    mova            m1, [rsp+gprsize*2+16*20]    ;in15
    pmulhrsw        m3, m0, [o(pw_4091x8)]       ;t31a
    pmulhrsw        m0, [o(pw_201x8)]            ;t16a
    pmulhrsw        m2, m1, [o(pw_3035x8)]       ;t30a
    pmulhrsw        m1, [o(pw_m2751x8)]          ;t17a
    mova            m7, [o(pd_2048)]
    psubsw          m4, m0, m1                   ;t17
    paddsw          m0, m1                       ;t16
    psubsw          m5, m3, m2                   ;t30
    paddsw          m3, m2                       ;t31
    ITX_MULSUB_2W   5, 4, 1, 2, 7, 799, 4017     ;t17a, t30a
    mova            [rsp+gprsize*2+16*19], m0    ;t16
    mova            [rsp+gprsize*2+16*20], m5    ;t17a
    mova            [rsp+gprsize*2+16*33], m4    ;t30a
    mova            [rsp+gprsize*2+16*34], m3    ;t31
    mova            m0, [rsp+gprsize*2+16*21]    ;in9
    mova            m1, [rsp+gprsize*2+16*22]    ;in7
    pmulhrsw        m3, m0, [o(pw_3703x8)]
    pmulhrsw        m0, [o(pw_1751x8)]
    pmulhrsw        m2, m1, [o(pw_3857x8)]
    pmulhrsw        m1, [o(pw_m1380x8)]
    psubsw          m4, m1, m0                   ;t18
    paddsw          m0, m1                       ;t19
    psubsw          m5, m2, m3                   ;t29
    paddsw          m3, m2                       ;t28
    ITX_MULSUB_2W   5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
    mova            [rsp+gprsize*2+16*21], m5    ;t18a
    mova            [rsp+gprsize*2+16*22], m0    ;t19
    mova            [rsp+gprsize*2+16*31], m3    ;t28
    mova            [rsp+gprsize*2+16*32], m4    ;t29a
    mova            m0, [rsp+gprsize*2+16*23]    ;in5
    mova            m1, [rsp+gprsize*2+16*24]    ;in11
    pmulhrsw        m3, m0, [o(pw_3973x8)]
    pmulhrsw        m0, [o(pw_995x8)]
    pmulhrsw        m2, m1, [o(pw_3513x8)]
    pmulhrsw        m1, [o(pw_m2106x8)]
    psubsw          m4, m0, m1                   ;t21
    paddsw          m0, m1                       ;t20
    psubsw          m5, m3, m2                   ;t26
    paddsw          m3, m2                       ;t27
    ITX_MULSUB_2W   5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
    mova            [rsp+gprsize*2+16*23], m0    ;t20
    mova            [rsp+gprsize*2+16*24], m5    ;t21a
    mova            [rsp+gprsize*2+16*29], m4    ;t26a
    mova            [rsp+gprsize*2+16*30], m3    ;t27
    mova            m0, [rsp+gprsize*2+16*25]    ;in13
    mova            m2, [rsp+gprsize*2+16*26]    ;in3
    pmulhrsw        m3, m0, [o(pw_3290x8)]
    pmulhrsw        m0, [o(pw_2440x8)]
    pmulhrsw        m1, m2, [o(pw_4052x8)]
    pmulhrsw        m2, [o(pw_m601x8)]
    jmp .main2

ALIGN function_align
3654.main: 3655 mova m7, [o(pd_2048)] 3656 mova m0, [rsp+gprsize*2+16*19] ;in1 3657 mova m1, [rsp+gprsize*2+16*20] ;in15 3658 mova m2, [rsp+gprsize*2+16*33] ;in17 3659 mova m3, [rsp+gprsize*2+16*34] ;in31 3660 ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a 3661 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a 3662 psubsw m4, m0, m2 ;t17 3663 paddsw m0, m2 ;t16 3664 psubsw m5, m3, m1 ;t30 3665 paddsw m3, m1 ;t31 3666 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a 3667 mova [rsp+gprsize*2+16*19], m0 ;t16 3668 mova [rsp+gprsize*2+16*20], m5 ;t17a 3669 mova [rsp+gprsize*2+16*33], m4 ;t30a 3670 mova [rsp+gprsize*2+16*34], m3 ;t31 3671 mova m0, [rsp+gprsize*2+16*21] ;in9 3672 mova m1, [rsp+gprsize*2+16*22] ;in7 3673 mova m2, [rsp+gprsize*2+16*31] ;in25 3674 mova m3, [rsp+gprsize*2+16*32] ;in23 3675 ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a 3676 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a 3677 psubsw m4, m2, m0 ;t18 3678 paddsw m0, m2 ;t19 3679 psubsw m5, m1, m3 ;t29 3680 paddsw m3, m1 ;t28 3681 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a 3682 mova [rsp+gprsize*2+16*21], m5 ;t18a 3683 mova [rsp+gprsize*2+16*22], m0 ;t19 3684 mova [rsp+gprsize*2+16*31], m3 ;t28 3685 mova [rsp+gprsize*2+16*32], m4 ;t29a 3686 mova m0, [rsp+gprsize*2+16*23] ;in5 3687 mova m1, [rsp+gprsize*2+16*24] ;in11 3688 mova m2, [rsp+gprsize*2+16*29] ;in21 3689 mova m3, [rsp+gprsize*2+16*30] ;in27 3690 ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a 3691 ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a 3692 psubsw m4, m0, m2 ;t21 3693 paddsw m0, m2 ;t20 3694 psubsw m5, m3, m1 ;t26 3695 paddsw m3, m1 ;t27 3696 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a 3697 mova [rsp+gprsize*2+16*23], m0 ;t20 3698 mova [rsp+gprsize*2+16*24], m5 ;t21a 3699 mova [rsp+gprsize*2+16*29], m4 ;t26a 3700 mova [rsp+gprsize*2+16*30], m3 ;t27 3701 mova m0, [rsp+gprsize*2+16*25] ;in13 3702 mova m1, [rsp+gprsize*2+16*26] ;in3 3703 mova m2, [rsp+gprsize*2+16*27] ;in29 3704 mova m3, 
[rsp+gprsize*2+16*28] ;in19 3705 ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a 3706 ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a 3707 3708.main2: 3709 psubsw m4, m2, m0 ;t22 3710 paddsw m0, m2 ;t23 3711 psubsw m5, m1, m3 ;t25 3712 paddsw m3, m1 ;t24 3713 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a 3714 mova m2, [rsp+gprsize*2+16*24] ;t21a 3715 psubsw m1, m5, m2 ;t21 3716 paddsw m5, m2 ;t22 3717 mova [rsp+gprsize*2+16*25], m5 ;t22 3718 mova m2, [rsp+gprsize*2+16*29] ;t26a 3719 psubsw m5, m4, m2 ;t26 3720 paddsw m4, m2 ;t25 3721 mova [rsp+gprsize*2+16*28], m4 ;t25 3722 ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a 3723 mova [rsp+gprsize*2+16*24], m5 ;t21a 3724 mova [rsp+gprsize*2+16*29], m1 ;t26a 3725 3726 mova m1, [rsp+gprsize*2+16*23] ;t20 3727 mova m5, [rsp+gprsize*2+16*30] ;t27 3728 psubsw m2, m0, m1 ;t20a 3729 paddsw m0, m1 ;t23a 3730 psubsw m6, m3, m5 ;t27a 3731 paddsw m3, m5 ;t24a 3732 ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 3733 mova [rsp+gprsize*2+16*26], m0 ;t23a 3734 mova [rsp+gprsize*2+16*27], m3 ;t24a 3735 mova [rsp+gprsize*2+16*30], m2 ;t27 3736 3737 mova m0, [rsp+gprsize*2+16*20] ;t17a 3738 mova m1, [rsp+gprsize*2+16*21] ;t18a 3739 mova m2, [rsp+gprsize*2+16*32] ;t29a 3740 mova m3, [rsp+gprsize*2+16*33] ;t30a 3741 psubsw m4, m0, m1 ;t18 3742 paddsw m0, m1 ;t17 3743 psubsw m5, m3, m2 ;t29 3744 paddsw m3, m2 ;t30 3745 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a 3746 mova [rsp+gprsize*2+16*20], m0 ;t17 3747 mova [rsp+gprsize*2+16*21], m5 ;t18a 3748 mova [rsp+gprsize*2+16*32], m4 ;t29a 3749 mova [rsp+gprsize*2+16*33], m3 ;t30 3750 mova m0, [rsp+gprsize*2+16*19] ;t16 3751 mova m1, [rsp+gprsize*2+16*22] ;t19 3752 mova m2, [rsp+gprsize*2+16*31] ;t28 3753 mova m3, [rsp+gprsize*2+16*34] ;t31 3754 psubsw m4, m0, m1 ;t19a 3755 paddsw m0, m1 ;t16a 3756 psubsw m5, m3, m2 ;t28a 3757 paddsw m3, m2 ;t31a 3758 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 3759 mova m2, [rsp+gprsize*2+16*15] ;tmp12 3760 psubsw m1, m5, 
m6 ;t20a 3761 paddsw m5, m6 ;t19a 3762 psubsw m6, m2, m5 ;out19 3763 paddsw m2, m5 ;out12 3764 mova m5, [rsp+gprsize*2+16*30] ;t27 3765 mova [rsp+gprsize*2+16*22], m6 ;out19 3766 mova [rsp+gprsize*2+16*15], m2 ;out12 3767 psubsw m6, m4, m5 ;t27a 3768 paddsw m4, m5 ;t28a 3769 ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 3770 mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 3771 psubsw m5, m2, m4 ;out28 3772 paddsw m2, m4 ;out3 3773 mova m4, [rsp+gprsize*2+16*14] ;tmp11 3774 mova [rsp+gprsize*2+16*31], m5 ;out28 3775 mova [rsp+gprsize*2+16*6 ], m2 ;out3 3776 psubsw m5, m4, m6 ;out20 3777 paddsw m4, m6 ;out11 3778 mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 3779 mova [rsp+gprsize*2+16*23], m5 ;out20 3780 mova [rsp+gprsize*2+16*14], m4 ;out11 3781 psubsw m5, m2, m1 ;out27 3782 paddsw m2, m1 ;out4 3783 mova m1, [rsp+gprsize*2+16*26] ;t23a 3784 mova m4, [rsp+gprsize*2+16*27] ;t24a 3785 mova [rsp+gprsize*2+16*30], m5 ;out27 3786 mova [rsp+gprsize*2+16*7 ], m2 ;out4 3787 psubsw m5, m0, m1 ;t23 3788 paddsw m0, m1 ;t16 3789 psubsw m2, m3, m4 ;t24 3790 paddsw m3, m4 ;t31 3791 ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a 3792 mova m6, [rsp+gprsize*2+16*18] ;tmp15 3793 psubsw m4, m6, m0 ;out16 3794 paddsw m6, m0 ;out15 3795 mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 3796 mova m1, [rsp+gprsize*2+16*11] ;tmp8 3797 mova [rsp+gprsize*2+16*18], m6 ;out15 3798 mova [rsp+gprsize*2+16*19], m4 ;out16 3799 psubsw m6, m0, m3 ;out31 3800 paddsw m0, m3 ;out0 3801 psubsw m4, m1, m2 ;out23 3802 paddsw m1, m2 ;out8 3803 mova m3, [rsp+gprsize*2+16*10] ;tmp7 3804 mova [rsp+gprsize*2+16*34], m6 ;out31 3805 mova [rsp+gprsize*2+16*11], m1 ;out8 3806 mova [rsp+gprsize*2+16*26], m4 ;out23 3807 paddsw m6, m3, m5 ;out7 3808 psubsw m3, m5 ;out24 3809 mova m1, [rsp+gprsize*2+16*20] ;t17 3810 mova m5, [rsp+gprsize*2+16*25] ;t22 3811 mova m2, [rsp+gprsize*2+16*17] ;tmp14 3812 mova [rsp+gprsize*2+16*27], m3 ;out24 3813 psubsw m4, m1, m5 ;t22a 3814 paddsw m1, m5 ;t17a 3815 psubsw m3, m2, m1 ;out17 3816 paddsw m2, m1 
;out14 3817 mova m5, [rsp+gprsize*2+16*28] ;t25 3818 mova m1, [rsp+gprsize*2+16*33] ;t30 3819 mova [rsp+gprsize*2+16*17], m2 ;out14 3820 mova [rsp+gprsize*2+16*20], m3 ;out17 3821 psubsw m2, m1, m5 ;t25a 3822 paddsw m1, m5 ;t30a 3823 ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 3824 mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 3825 psubsw m3, m5, m1 ;out30 3826 paddsw m5, m1 ;out1 3827 mova m1, [rsp+gprsize*2+16*12] ;tmp9 3828 mova [rsp+gprsize*2+16*33], m3 ;out30 3829 mova [rsp+gprsize*2+16*4 ], m5 ;out1 3830 psubsw m3, m1, m2 ;out22 3831 paddsw m1, m2 ;out9 3832 mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 3833 mova [rsp+gprsize*2+16*25], m3 ;out22 3834 mova [rsp+gprsize*2+16*12], m1 ;out9 3835 psubsw m3, m5, m4 ;out25 3836 paddsw m5, m4 ;out6 3837 mova m4, [rsp+gprsize*2+16*21] ;t18a 3838 mova m1, [rsp+gprsize*2+16*24] ;t21a 3839 mova m2, [rsp+gprsize*2+16*16] ;tmp13 3840 mova [rsp+gprsize*2+16*28], m3 ;out25 3841 mova [rsp+gprsize*2+16*9 ], m5 ;out6 3842 paddsw m3, m4, m1 ;t18 3843 psubsw m4, m1 ;t21 3844 psubsw m5, m2, m3 ;out18 3845 paddsw m2, m3 ;out13 3846 mova m1, [rsp+gprsize*2+16*29] ;t26a 3847 mova m3, [rsp+gprsize*2+16*32] ;t29a 3848 mova [rsp+gprsize*2+16*21], m5 ;out18 3849 mova [rsp+gprsize*2+16*16], m2 ;out13 3850 psubsw m5, m3, m1 ;t26 3851 paddsw m3, m1 ;t29 3852 ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a 3853 mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 3854 psubsw m1, m2, m3 ;out29 3855 paddsw m2, m3 ;out2 3856 mova m3, [rsp+gprsize*2+16*13] ;tmp10 3857 mova [rsp+gprsize*2+16*32], m1 ;out29 3858 psubsw m7, m3, m5 ;out21 3859 paddsw m3, m5 ;out10 3860 mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 3861 mova [rsp+gprsize*2+16*24], m7 ;out21 3862 mova [rsp+gprsize*2+16*13], m3 ;out10 3863 psubsw m1, m5, m4 ;out26 3864 paddsw m5, m4 ;out5 3865 mova m7, m6 ;out7 3866 mova m3, [rsp+gprsize*2+16*6 ] ;out3 3867 mova m4, [rsp+gprsize*2+16*7 ] ;out4 3868 mova [rsp+gprsize*2+16*29], m1 ;out26 3869 mova m6, [rsp+gprsize*2+16*9 ] ;out6 3870 mova m1, [rsp+gprsize*2+16*4 ] ;out1 
; Listing restored to one statement per line (stray listing numbers dropped,
; so trailing ';' comments no longer swallow the following instructions).
; The leading `ret` closes the routine that precedes this span.
    ret


; 32x8 DCT/DCT inverse transform + add.
; eob == 0 takes the DC-only path; otherwise idct_32x8_internal does the work.
cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz .dconly
    call m(idct_32x8_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_8192)]
    mov       [coeffq], eobd          ;eobd is 0 here (jz above): clears the DC coefficient
    mov       r3d, 8                  ;row count for .loop
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]

; Shared tail: the 32x16 and 32x32 .dconly paths in this file jump here with
; m0/m1/m2, r3d (rows) and tx2q (continuation) already set up.
.body:
    pmulhrsw  m0, m2
    movd      m2, [o(pw_2048)]        ;intentionally rip-relative
    pmulhrsw  m0, m1
    pmulhrsw  m0, m2
    pshuflw   m0, m0, q0000
    punpcklwd m0, m0
    pxor      m5, m5                  ;zero, used to widen bytes to words

; One row (32 pixels) per iteration: widen, add m0, pack back with unsigned
; saturation.
.loop:
    mova      m1, [dstq+16*0]
    mova      m3, [dstq+16*1]
    punpckhbw m2, m1, m5
    punpcklbw m1, m5
    punpckhbw m4, m3, m5
    punpcklbw m3, m5
    paddw     m2, m0
    paddw     m1, m0
    paddw     m4, m0
    paddw     m3, m0
    packuswb  m1, m2
    packuswb  m3, m4
    mova      [dstq+16*0], m1
    mova      [dstq+16*1], m3
    add       dstq, strideq
    dec       r3d
    jg .loop
    jmp       tx2q

.end:
    RET


; Worker for the 32x8 DCT: builds the 32-point transform from the 8x8 and 16x8
; .main helpers plus idct_8x32_internal's .main/.main_fast, then emits the
; result as four 8-pixel-wide column groups (.end .. .end8).
cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp
    LOAD_8ROWS coeffq+16*0, 64
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+16*2, 64
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    ; odd inputs go to the stack slots read by idct_8x32_internal's .main*
    LOAD_8ROWS coeffq+16*1, 32
    mova      [rsp+gprsize+16*19], m0                     ;in1
    mova      [rsp+gprsize+16*26], m1                     ;in3
    mova      [rsp+gprsize+16*23], m2                     ;in5
    mova      [rsp+gprsize+16*22], m3                     ;in7
    mova      [rsp+gprsize+16*21], m4                     ;in9
    mova      [rsp+gprsize+16*24], m5                     ;in11
    mova      [rsp+gprsize+16*25], m6                     ;in13
    mova      [rsp+gprsize+16*20], m7                     ;in15

    cmp       eobd, 106
    jg .full
    call m(idct_8x32_internal_8bpc).main_fast             ;eob <= 106: in17..in31 not loaded
    jmp .pass2

.full:
    LOAD_8ROWS coeffq+16*17, 32
    mova      [rsp+gprsize+16*33], m0                     ;in17
    mova      [rsp+gprsize+16*28], m1                     ;in19
    mova      [rsp+gprsize+16*29], m2                     ;in21
    mova      [rsp+gprsize+16*32], m3                     ;in23
    mova      [rsp+gprsize+16*31], m4                     ;in25
    mova      [rsp+gprsize+16*30], m5                     ;in27
    mova      [rsp+gprsize+16*27], m6                     ;in29
    mova      [rsp+gprsize+16*34], m7                     ;in31
    call m(idct_8x32_internal_8bpc).main

.pass2:
    mova      [rsp+gprsize+16*0 ], m7
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end)]
    jmp m(idct_8x32_internal_8bpc).end1                   ;zeroes the coefficients, then jumps to tx2q

.end:
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.end1:
    lea       r3, [dstq+8]                                ;r3 = next 8-pixel column group
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end2)]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end2:
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova      [rsp+gprsize+16*0 ], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end3)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.end3:
    mov       dstq, r3
    add       r3, 8
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end4)]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end4:
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova      [rsp+gprsize+16*0 ], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end5)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.end5:
    mov       dstq, r3
    add       r3, 8
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end6)]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end6:
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova      [rsp+gprsize+16*0 ], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end7)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.end7:
    mov       dstq, r3
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end8:
    ret


; 8x32 identity/identity inverse transform + add.
; r3d = number of loop iterations: 4 when eob >= 107, otherwise 2.
cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov       r5d, 4
    mov       tx2d, 2
    cmp       eobd, 107
    cmovns    tx2d, r5d
    mov       r3d, tx2d
; Listing restored to one statement per line (stray listing numbers dropped).
; This span continues inv_txfm_add_identity_identity_8x32_8bpc: r3d already
; holds the iteration count computed just above.
%if ARCH_X86_32
    LEA       r5, $$
%endif
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end8)]  ;end8 is a bare `ret`
.loop:
    LOAD_8ROWS coeffq+16*0, 64
    ; add 5 to all eight rows; m6 is parked on the stack while it holds pw_5
    paddsw    m6, [o(pw_5)]
    mova      [rsp+16*1], m6
    mova      m6, [o(pw_5)]
    REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    call m(idct_8x8_internal_8bpc).pass1_end3
    REPX {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    mova      [rsp+16*2], m5
    mova      [rsp+16*1], m6
    mova      [rsp+16*0], m7
    call m(idct_8x8_internal_8bpc).end3
    lea       dstq, [dstq+strideq*2]
    pxor      m7, m7
    REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ;clear the coefficients just consumed
    add       coeffq, 16
    dec       r3d
    jg .loop
    RET

; 32x8 identity/identity inverse transform + add.
; Handles one group of 8 coefficient rows per iteration and advances dst by
; 8 pixels each time; r3d = 4 iterations when eob >= 107, otherwise 2.
cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov       r5d, 4
    mov       tx2d, 2
    cmp       eobd, 107
    cmovns    tx2d, r5d
    mov       r3d, tx2d
%if ARCH_X86_32
    LEA       r5, $$
%endif

.loop:
    LOAD_8ROWS coeffq+16*0, 16
    pmulhrsw  m6, [o(pw_4096)]
    mova      [rsp+16*1], m6
    mova      m6, [o(pw_4096)]
    REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    lea       tx2q, [o(m(idct_32x8_internal_8bpc).end8)]  ;end8 is a bare `ret`
    call m(idct_8x8_internal_8bpc).pass1_end3

    mov       [rsp+16*3], dstq                            ;saved here, reloaded after the call
    mova      [rsp+16*2], m5
    mova      [rsp+16*1], m6
    mova      [rsp+16*0], m7
    lea       tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
    call m(idct_8x8_internal_8bpc).end3

    add       coeffq, 16*8
    mov       dstq, [rsp+16*3]
    lea       dstq, [dstq+8]                              ;next 8-pixel column group
    dec       r3d
    ; The counter alone ends the loop. dec does not write CF, so a carry-based
    ; branch here would only see the stale CF left by `add coeffq, 16*8`.
    jg .loop
    RET


; 16x32 DCT/DCT inverse transform + add.
; eob == 0 takes the DC-only path; otherwise idct_16x32_internal does the work.
cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz .dconly
    call m(idct_16x32_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_16384)]
    mov       [coeffq], eobd          ;eobd is 0 here (jz above): clears the DC coefficient
    pmulhrsw  m0, m1
    mov       r2d, 16
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly

.end:
    RET

; Worker for the 16x32 DCT. The first pass runs the 8x8/16x8 .main helpers on
; the coefficient groups at coeffq+16*{1,5}, {0,4}, and (only when eob > 150)
; {2,6}, {3,7}. Results are spilled to coeffq+16*32.. and to the stack slots
; later read by idct_8x32_internal's .main/.main_fast.
cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    LOAD_8ROWS coeffq+16*1, 128, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*5, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS coeffq+16*33, 64                           ;in8~in15
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    mova      [coeffq+16*1 ], m0                          ;in8
    mova      [coeffq+16*5 ], m4                          ;in12
    mova      [rsp+gprsize+16*13], m2                     ;in10
    mova      [rsp+gprsize+16*14], m6                     ;in14
    mova      [rsp+gprsize+16*21], m1                     ;in9
    mova      [rsp+gprsize+16*24], m3                     ;in11
    mova      [rsp+gprsize+16*25], m5                     ;in13
    mova      [rsp+gprsize+16*20], m7                     ;in15
    LOAD_8ROWS coeffq+16*0, 128, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*4, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    SAVE_8ROWS coeffq+16*32, 64                           ;in0~in7
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    mova      [rsp+gprsize+16*11], m2                     ;in2
    mova      [rsp+gprsize+16*12], m6                     ;in6
    mova      [rsp+gprsize+16*19], m1                     ;in1
    mova      [rsp+gprsize+16*26], m3                     ;in3
    mova      [rsp+gprsize+16*23], m5                     ;in5
    mova      [rsp+gprsize+16*22], m7                     ;in7

    cmp       eobd, 150
    jg .full

    ; eob <= 150: in16..in31 are fed as zero
    mova      m1, m4                                      ;in4
    mova      m2, [coeffq+16*1 ]                          ;in8
    mova      m3, [coeffq+16*5 ]                          ;in12
    pxor      m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    mova      m0, [rsp+gprsize+16*11]                     ;in2
    mova      m1, [rsp+gprsize+16*12]                     ;in6
    mova      m2, [rsp+gprsize+16*13]                     ;in10
    mova      m3, [rsp+gprsize+16*14]                     ;in14
    pxor      m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main_fast
    jmp .pass2

.full:
    mova      [coeffq+16*0 ], m0                          ;in0
    mova      [coeffq+16*4 ], m4                          ;in4

    LOAD_8ROWS coeffq+16*2, 128, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*6, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS coeffq+16*34, 64                           ;in16~in23
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end5:
    mova      [coeffq+16*2 ], m0                          ;in16
    mova      [coeffq+16*6 ], m4                          ;in20
    mova      [rsp+gprsize+16*15], m2                     ;in18
    mova      [rsp+gprsize+16*16], m6                     ;in22
    mova      [rsp+gprsize+16*33], m1                     ;in17
    mova      [rsp+gprsize+16*28], m3                     ;in19
    mova      [rsp+gprsize+16*29], m5                     ;in21
    mova      [rsp+gprsize+16*32], m7                     ;in23

    LOAD_8ROWS coeffq+16*3, 128, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+16*7, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end6:
    SAVE_8ROWS coeffq+16*35, 64                           ;in24~in31
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end7:
; Listing restored to one statement per line (stray listing numbers dropped).
; This span continues idct_16x32_internal_8bpc right after its .pass1_end7
; label: the last first-pass group is spilled, then the 32-point column
; transform runs for the left 8 columns (.pass2) and the right 8 (.end).
    mova      [rsp+gprsize+16*17], m2                     ;in26
    mova      [rsp+gprsize+16*18], m6                     ;in30
    mova      [rsp+gprsize+16*31], m1                     ;in25
    mova      [rsp+gprsize+16*30], m3                     ;in27
    mova      [rsp+gprsize+16*27], m5                     ;in29
    mova      [rsp+gprsize+16*34], m7                     ;in31

    mova      m6, m0                                      ;in24
    mova      m7, m4                                      ;in28
    mova      m0, [coeffq+16*0 ]                          ;in0
    mova      m1, [coeffq+16*4 ]                          ;in4
    mova      m2, [coeffq+16*1 ]                          ;in8
    mova      m3, [coeffq+16*5 ]                          ;in12
    mova      m4, [coeffq+16*2 ]                          ;in16
    mova      m5, [coeffq+16*6 ]                          ;in20
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3 , 16
    LOAD_8ROWS rsp+gprsize+16*11, 16
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main

.pass2:
    ; eob and the address of the right-hand 8 columns are kept on the stack
    ; and reloaded at .end
    mov       [rsp+gprsize*1+16*35], eobd
    lea       r3, [dstq+8]
    mov       [rsp+gprsize*2+16*35], r3
    lea       r3, [o(m(idct_16x32_internal_8bpc).end)]    ;continuation used by idct_8x32 .end5
    jmp m(idct_8x32_internal_8bpc).end

.end:
    mov       dstq, [rsp+gprsize*2+16*35]
    mov       eobd, [rsp+gprsize*1+16*35]
    add       coeffq, 16*32

    mova      m0, [coeffq+16*4 ]                          ;in1
    mova      m1, [coeffq+16*12]                          ;in3
    mova      m2, [coeffq+16*20]                          ;in5
    mova      m3, [coeffq+16*28]                          ;in7
    mova      m4, [coeffq+16*5 ]                          ;in9
    mova      m5, [coeffq+16*13]                          ;in11
    mova      m6, [coeffq+16*21]                          ;in13
    mova      m7, [coeffq+16*29]                          ;in15

    mova      [rsp+gprsize+16*19], m0                     ;in1
    mova      [rsp+gprsize+16*26], m1                     ;in3
    mova      [rsp+gprsize+16*23], m2                     ;in5
    mova      [rsp+gprsize+16*22], m3                     ;in7
    mova      [rsp+gprsize+16*21], m4                     ;in9
    mova      [rsp+gprsize+16*24], m5                     ;in11
    mova      [rsp+gprsize+16*25], m6                     ;in13
    mova      [rsp+gprsize+16*20], m7                     ;in15

    mova      m0, [coeffq+16*0 ]                          ;in0
    mova      m1, [coeffq+16*16]                          ;in4
    mova      m2, [coeffq+16*1 ]                          ;in8
    mova      m3, [coeffq+16*17]                          ;in12

    cmp       eobd, 150
    jg .full1

    ; eob <= 150: in16..in31 are fed as zero
    pxor      m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    mova      m0, [coeffq+16*8 ]                          ;in2
    mova      m1, [coeffq+16*24]                          ;in6
    mova      m2, [coeffq+16*9 ]                          ;in10
    mova      m3, [coeffq+16*25]                          ;in14
    pxor      m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main_fast
    jmp .end1

.full1:
    mova      m4, [coeffq+16*2 ]                          ;in16
    mova      m5, [coeffq+16*18]                          ;in20
    mova      m6, [coeffq+16*3 ]                          ;in24
    mova      m7, [coeffq+16*19]                          ;in28
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    mova      m0, [coeffq+16*8 ]                          ;in2
    mova      m1, [coeffq+16*24]                          ;in6
    mova      m2, [coeffq+16*9 ]                          ;in10
    mova      m3, [coeffq+16*25]                          ;in14
    mova      m4, [coeffq+16*10]                          ;in18
    mova      m5, [coeffq+16*26]                          ;in22
    mova      m6, [coeffq+16*11]                          ;in26
    mova      m7, [coeffq+16*27]                          ;in30
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova      m0, [coeffq+16*6 ]                          ;in17
    mova      m1, [coeffq+16*14]                          ;in19
    mova      m2, [coeffq+16*22]                          ;in21
    mova      m3, [coeffq+16*30]                          ;in23
    mova      m4, [coeffq+16*7 ]                          ;in25
    mova      m5, [coeffq+16*15]                          ;in27
    mova      m6, [coeffq+16*23]                          ;in29
    mova      m7, [coeffq+16*31]                          ;in31

    mova      [rsp+gprsize+16*33], m0                     ;in17
    mova      [rsp+gprsize+16*28], m1                     ;in19
    mova      [rsp+gprsize+16*29], m2                     ;in21
    mova      [rsp+gprsize+16*32], m3                     ;in23
    mova      [rsp+gprsize+16*31], m4                     ;in25
    mova      [rsp+gprsize+16*30], m5                     ;in27
    mova      [rsp+gprsize+16*27], m6                     ;in29
    mova      [rsp+gprsize+16*34], m7                     ;in31

    call m(idct_8x32_internal_8bpc).main

.end1:
    jmp m(idct_8x32_internal_8bpc).pass2



; 32x16 DCT/DCT inverse transform + add.
; eob == 0 takes the DC-only path. Otherwise idct_32x16_internal runs the
; first pass and the second pass is done per 8-pixel column group through
; idct_8x16_internal's .pass2.
cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz .dconly

    call m(idct_32x16_internal_8bpc)
    call m(idct_8x16_internal_8bpc).pass2

    add       coeffq, 16*16
    lea       dstq, [r3+8]
    LOAD_8ROWS rsp+16*11, 16
    mova      [rsp+16*0], m7
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).end)]  ;.end is a bare `ret`
    call m(idct_8x8_internal_8bpc).pass1_end
    call m(idct_8x16_internal_8bpc).pass2

    add       coeffq, 16*16
    lea       dstq, [r3+8]
    LOAD_8ROWS rsp+16*19, 16
    mova      [rsp+16*0], m7
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call m(idct_8x8_internal_8bpc).pass1_end
    call m(idct_8x16_internal_8bpc).pass2

    add       coeffq, 16*16
    lea       dstq, [r3+8]
    LOAD_8ROWS rsp+16*27, 16
    mova      [rsp+16*0], m7
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call m(idct_8x8_internal_8bpc).pass1_end
    call m(idct_8x16_internal_8bpc).pass2
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_16384)]
    mov       [coeffq], eobd          ;eobd is 0 here (jz above): clears the DC coefficient
    pmulhrsw  m0, m1
    mov       r3d, 16                 ;row count for the shared 32-wide add loop
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body


; First-pass worker for the 32x16 DCT. The .pass1 body runs twice: first at
; coeffq+16 with r3 = .pass1_end1, then again at the original coeffq with
; r3 = .end.
cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    add       coeffq, 16
    lea       r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)]
.pass1:
    LOAD_8ROWS coeffq+16*0, 128, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    LOAD_8ROWS coeffq+16*4, 128, 1
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+16*2, 64, 1
    mova      [rsp+gprsize+16*19], m0                     ;in1
    mova      [rsp+gprsize+16*26], m1                     ;in3
    mova      [rsp+gprsize+16*23], m2                     ;in5
    mova      [rsp+gprsize+16*22], m3                     ;in7
    mova      [rsp+gprsize+16*21], m4                     ;in9
    mova      [rsp+gprsize+16*24], m5                     ;in11
    mova      [rsp+gprsize+16*25], m6                     ;in13
    mova      [rsp+gprsize+16*20], m7                     ;in15

    LOAD_8ROWS coeffq+16*34, 64, 1
    mova      [rsp+gprsize+16*33], m0                     ;in17
    mova      [rsp+gprsize+16*28], m1                     ;in19
; Listing restored to one statement per line (stray listing numbers dropped).
; This span continues the .pass1 body of idct_32x16_internal_8bpc: the rest
; of the odd inputs are spilled, then idct_8x32_internal's .main runs.
    mova      [rsp+gprsize+16*29], m2                     ;in21
    mova      [rsp+gprsize+16*32], m3                     ;in23
    mova      [rsp+gprsize+16*31], m4                     ;in25
    mova      [rsp+gprsize+16*30], m5                     ;in27
    mova      [rsp+gprsize+16*27], m6                     ;in29
    mova      [rsp+gprsize+16*34], m7                     ;in31
    call m(idct_8x32_internal_8bpc).main

.pass1_end:
    mova      [rsp+gprsize+16*0 ], m7
    mov       tx2q, r3                                    ;.pass1_end1 on the first run, .end on the second
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    SAVE_8ROWS coeffq+16*0, 32
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova      [rsp+gprsize+16*0 ], m7
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    SAVE_8ROWS coeffq+16*16, 32
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova      [rsp+gprsize+16*0 ], m7
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    SAVE_8ROWS coeffq+16*32, 32
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova      [rsp+gprsize+16*0 ], m7
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)]
    jmp m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS coeffq+16*48, 32

    sub       coeffq, 16                                  ;undo the `add coeffq, 16` done on entry
    lea       r3, [o(m(idct_32x16_internal_8bpc).end)]
    jmp .pass1

.end:
    ret


; 16x32 identity/identity inverse transform + add.
; r3d = 4 - (eob < 43) - (eob < 150) - (eob < 278): inner-loop iterations per
; 8-pixel column half. The outer loop runs for dst and then for dst+8.
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    %undef cmp

    mov       r4d, eobd
    cmp       eobd, 43                ;if (eob > 43)
    sbb       r3d, r3d                ;  iteration_count++
    cmp       r4d, 150                ;if (eob > 150)
    sbb       r3d, 0                  ;  iteration_count++
    cmp       r4d, 278                ;if (eob > 278)
    sbb       r3d, -4                 ;  iteration_count++

%if ARCH_X86_32
    LEA       r5, $$
%endif
    lea       r4, [dstq+8]
    mov       [rsp+16*3], r4                              ;dst of the second column half
    mov       [rsp+gprsize+16*3], r3d                     ;iteration count, reused for the second half
    mov       [rsp+gprsize*2+16*3], coeffq

.loop:
    LOAD_8ROWS coeffq, 64, 1
    mova      [rsp+16*1], m6
    pxor      m6, m6
    REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 ;clear the coefficients just loaded
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).end)]  ;.end is a bare `ret`
    call m(idct_8x8_internal_8bpc).pass1_end3
    mova      [rsp+16*0], m2
    mova      [rsp+16*1], m3
    mova      [rsp+16*2], m4
    mova      m3, [o(pw_1697x16)]
    mova      m4, [o(pw_16384)]
    REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
    mova      m2, [o(pw_8192)]
    REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
    mova      m2, [rsp+16*0]
    mova      [rsp+16*0], m7
    IDTX16 2, 7, 3, 4
    mova      m7, [rsp+16*2]
    mova      [rsp+16*2], m5
    IDTX16 7, 5, 3, 4
    mova      m5, [rsp+16*1]
    mova      [rsp+16*1], m6
    pmulhrsw  m3, m5
    pmulhrsw  m3, m4
    psrlw     m4, 1                   ; pw_8192
    paddsw    m3, m5
    pmulhrsw  m2, m4
    pmulhrsw  m3, m4
    pmulhrsw  m4, m7
    call m(idct_8x8_internal_8bpc).end3
    lea       dstq, [dstq+strideq*2]
    add       coeffq, 16
    dec       r3d
    jg .loop
    ; Column half finished: reload the saved count and zero the saved copy, so
    ; the second time through this point r3d reads back as 0 and the loop exits.
    mov       coeffq, [rsp+gprsize*2+16*3]
    add       coeffq, 64*8
    mov       r3d, [rsp+gprsize+16*3]
    xor       dstq, dstq
    mov       [rsp+gprsize+16*3], dstq
    mov       dstq, [rsp+16*3]
    test      r3d, r3d
    jnz .loop
    RET


; 32x16 identity/identity inverse transform + add.
; r3d is a bit pattern consumed two bits per iteration in .loop_end:
; 12 (eob < 44), 136 (eob < 151) or 34952 (otherwise).
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    %undef cmp

    mov       r4d, 12                 ;1100b
    mov       r5d, 136                ;1000 1000b
    cmp       eobd, 44                ;if (eob > 43)
    cmovns    r4d, r5d                ;  iteration_count+2
    cmp       eobd, 151               ;if (eob > 150)
    mov       r3d, 34952              ;1000 1000 1000 1000b
    cmovs     r3d, r4d                ;  iteration_count += 4

%if ARCH_X86_32
    LEA       r5, $$
%endif
    lea       r4, [dstq+8]
    mov       [rsp+16*3], r4                              ;dst of the next 8-pixel column group

.loop:
    LOAD_8ROWS coeffq, 32, 1
    REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    mova      [rsp+16*1], m6
    lea       tx2q, [o(m(idct_32x16_internal_8bpc).end)]  ;.end is a bare `ret`
    call m(idct_8x8_internal_8bpc).pass1_end3
    mova      [rsp+16*1], m5
    mova      [rsp+16*2], m6
    mova      m6, [o(pw_1697x16)]
    REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
    pmulhrsw  m7, [o(pw_2048)]
    mova      m5, [rsp+16*1]
    mova      [rsp+16*0], m7
    IDTX16 5, 7, 6
    mova      m7, [rsp+16*2]
    IDTX16 7, 6, 6
    mova      m6, [o(pw_2048)]
    REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    mova      [rsp+16*2], m5
    mova      [rsp+16*1], m7
    call m(idct_8x8_internal_8bpc).end3
    lea       dstq, [dstq+strideq*2]
    pxor      m7, m7
    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ;clear the coefficients just consumed

.loop_end:
    add       coeffq, 16
    shr       r3d, 2                  ;consume two bits of the pattern
    jz .ret
    test      r3d, 2
    jnz .loop                         ;bit 1 set: continue in the same column group
    ; otherwise move on to the next 8-pixel column group
    mov       r4d, r3d
    and       r4d, 1
    lea       coeffq, [coeffq+r4*8+32*7]
    mov       dstq, [rsp+16*3]
    lea       r4, [dstq+8]
    mov       [rsp+16*3], r4
    jmp .loop

.ret:
    RET


; 32x32 DCT/DCT inverse transform + add.
; eob == 0 takes the DC-only path (shared 32-wide add loop, 32 rows);
; otherwise idct_32x32_internal does the work.
cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz .dconly

    call m(idct_32x32_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_8192)]
    mov       [coeffq], eobd          ;eobd is 0 here (jz above): clears the DC coefficient
    mov       r3d, 32
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body


; Worker for the 32x32 DCT (continues past this span).
; eob-136 is kept on the stack; its sign selects the .fast/.full first-pass
; variant, and r3d = 2 pass-1 iterations when eob < 136, otherwise 4.
cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov       r4d, 2
    sub       eobd, 136
    mov       [rsp+gprsize*1+16*35], eobd                 ;mov leaves the flags from `sub` intact
    mov       r3d, 4
    cmovs     r3d, r4d

%if ARCH_X86_32
    LEA       r5, $$
%endif

    mov       [rsp+gprsize*2+16*35], coeffq

.pass1_loop:
    LOAD_8ROWS coeffq+64*1, 64*2
    mova      [rsp+gprsize+16*19], m0                     ;in1
    mova      [rsp+gprsize+16*26], m1                     ;in3
    mova      [rsp+gprsize+16*23], m2                     ;in5
    mova      [rsp+gprsize+16*22], m3                     ;in7
    mova      [rsp+gprsize+16*21], m4                     ;in9
    mova      [rsp+gprsize+16*24], m5                     ;in11
    mova      [rsp+gprsize+16*25], m6                     ;in13
    mova      [rsp+gprsize+16*20], m7                     ;in15

    mov       tx2d, [rsp+gprsize*1+16*35]
    test      tx2d, tx2d
    jl .fast

.full:
    LOAD_8ROWS coeffq+64*0, 64*4
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+64*2, 64*4
    call m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
; idct_32x32_internal_8bpc, .full path of pass 1 (continued from the lines above)
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+64*17, 64*2
    mova     [rsp+gprsize+16*33], m0  ;in17
    mova     [rsp+gprsize+16*28], m1  ;in19
    mova     [rsp+gprsize+16*29], m2  ;in21
    mova     [rsp+gprsize+16*32], m3  ;in23
    mova     [rsp+gprsize+16*31], m4  ;in25
    mova     [rsp+gprsize+16*30], m5  ;in27
    mova     [rsp+gprsize+16*27], m6  ;in29
    mova     [rsp+gprsize+16*34], m7  ;in31

    call     m(idct_8x32_internal_8bpc).main
    jmp .pass1_end

; pass 1, reduced path: only four inputs are loaded for each sub-transform,
; m4-m7 are zeroed, and in17..in31 are not loaded at all
.fast:
    mova     m0, [coeffq+256*0]
    mova     m1, [coeffq+256*1]
    mova     m2, [coeffq+256*2]
    mova     m3, [coeffq+256*3]
    pxor     m4, m4
    REPX     {mova x, m4}, m5, m6, m7
    call     m(idct_8x8_internal_8bpc).main

    SAVE_7ROWS rsp+gprsize+16*3, 16
    mova     m0, [coeffq+128*1]
    mova     m1, [coeffq+128*3]
    mova     m2, [coeffq+128*5]
    mova     m3, [coeffq+128*7]
    pxor     m4, m4
    REPX     {mova x, m4}, m5, m6, m7
    call     m(idct_16x8_internal_8bpc).main
    mova     m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call     m(idct_8x32_internal_8bpc).main_fast

; .pass1_end .. .pass1_end4: the four groups of eight rows are sent one after
; another through idct_8x8's .pass1_end1 (m7 = pw_8192, continuation in tx2q)
; and written back to coeffq+64*0 / 64*8 / 64*16 / 64*24.
.pass1_end:
    mova     [rsp+gprsize+16*0], m7
    mova     m7, [o(pw_8192)]
    lea      tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova     [rsp+gprsize+16*0], m7
    mova     m7, [o(pw_8192)]
    lea      tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova     [rsp+gprsize+16*0], m7
    mova     m7, [o(pw_8192)]
    lea      tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end3:
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova     [rsp+gprsize+16*0], m7
    mova     m7, [o(pw_8192)]
    lea      tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end4:
    SAVE_8ROWS coeffq+64*24, 64

    add      coeffq, 16
    dec      r3d
    jg .pass1_loop


; pass 2: always four iterations, each restarting from the saved coeff pointer
; plus 16*32 per iteration and moving dst 8 bytes to the right
.pass2:
    mov      coeffq, [rsp+gprsize*2+16*35]
    mov      r3d, 4
    lea      tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]

.pass2_loop:
    mov      [rsp+gprsize*3+16*35], r3d  ;loop counter
    lea      r3, [dstq+8]
    mov      [rsp+gprsize*2+16*35], r3  ;dst of the next iteration

    mova     m0, [coeffq+16*4 ]
    mova     m1, [coeffq+16*12]
    mova     m2, [coeffq+16*20]
    mova     m3, [coeffq+16*28]
    mova     m4, [coeffq+16*5 ]
    mova     m5, [coeffq+16*13]
    mova     m6, [coeffq+16*21]
    mova     m7, [coeffq+16*29]
    mova     [rsp+gprsize+16*19], m0  ;in1
    mova     [rsp+gprsize+16*26], m1  ;in3
    mova     [rsp+gprsize+16*23], m2  ;in5
    mova     [rsp+gprsize+16*22], m3  ;in7
    mova     [rsp+gprsize+16*21], m4  ;in9
    mova     [rsp+gprsize+16*24], m5  ;in11
    mova     [rsp+gprsize+16*25], m6  ;in13
    mova     [rsp+gprsize+16*20], m7  ;in15

    mov      eobd, [rsp+gprsize*1+16*35]  ;eob-136 stored at function entry
    test     eobd, eobd
    jl .fast1

.full1:
    mova     m0, [coeffq+16*0 ]
    mova     m1, [coeffq+16*16]
    mova     m2, [coeffq+16*1 ]
    mova     m3, [coeffq+16*17]
    mova     m4, [coeffq+16*2 ]
    mova     m5, [coeffq+16*18]
    mova     m6, [coeffq+16*3 ]
    mova     m7, [coeffq+16*19]
    call     m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    mova     m0, [coeffq+16*8 ]
    mova     m1, [coeffq+16*24]
    mova     m2, [coeffq+16*9 ]
    mova     m3, [coeffq+16*25]
    mova     m4, [coeffq+16*10]
    mova     m5, [coeffq+16*26]
    mova     m6, [coeffq+16*11]
    mova     m7, [coeffq+16*27]
    call     m(idct_16x8_internal_8bpc).main
    mova     m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova     m0, [coeffq+16*6 ]
    mova     m1, [coeffq+16*14]
    mova     m2, [coeffq+16*22]
    mova     m3, [coeffq+16*30]
    mova     m4, [coeffq+16*7 ]
    mova     m5, [coeffq+16*15]
    mova     m6, [coeffq+16*23]
    mova     m7, [coeffq+16*31]
    mova     [rsp+gprsize+16*33], m0  ;in17
    mova     [rsp+gprsize+16*28], m1  ;in19
    mova     [rsp+gprsize+16*29], m2  ;in21
    mova     [rsp+gprsize+16*32], m3  ;in23
    mova     [rsp+gprsize+16*31], m4  ;in25
    mova     [rsp+gprsize+16*30], m5  ;in27
    mova     [rsp+gprsize+16*27], m6  ;in29
    mova     [rsp+gprsize+16*34], m7  ;in31

    call     m(idct_8x32_internal_8bpc).main
    jmp tx2q  ;tx2q = .pass2_end

.fast1:
    mova     m0, [coeffq+16*0 ]
    mova     m1, [coeffq+16*16]
    mova     m2, [coeffq+16*1 ]
    mova     m3, [coeffq+16*17]
    pxor     m4, m4
    REPX     {mova x, m4}, m5, m6, m7
    call     m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    mova     m0, [coeffq+16*8 ]
    mova     m1, [coeffq+16*24]
    mova     m2, [coeffq+16*9 ]
    mova     m3, [coeffq+16*25]
    pxor     m4, m4
    REPX     {mova x, m4}, m5, m6, m7
    call     m(idct_16x8_internal_8bpc).main
    mova     m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call     m(idct_8x32_internal_8bpc).main_fast
    jmp tx2q  ;tx2q = .pass2_end

.pass2_end:
    lea      r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)]
    jmp m(idct_8x32_internal_8bpc).end

.pass2_end1:
    lea      tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
    add      coeffq, 16*32
    mov      dstq, [rsp+gprsize*2+16*35]
    mov      r3d, [rsp+gprsize*3+16*35]
    dec      r3d
    jg .pass2_loop

    ret


; inv_txfm_add_identity_identity_32x32_8bpc
; Two nested loops with the same trip count n (n = 2 when eob < 136, else 4).
; Stack slots:
;   [rsp+gprsize*0+16*3]  dst of the next outer iteration (dst+8)
;   [rsp+gprsize*1+16*3]  n, used to reload the inner counter r3d
;   [rsp+gprsize*2+16*3]  remaining outer iterations
;   [rsp+gprsize*3+16*3]  coeff pointer at the start of the current outer iteration
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
    %undef cmp

    mov      r4d, 2
    cmp      eobd, 136
    mov      r3d, 4
    cmovs    r3d, r4d  ;uses the sign flag of the cmp above

%if ARCH_X86_32
    LEA      r5, $$
%endif

    lea      r4, [dstq+8]
    mov      [rsp+gprsize*0+16*3], r4
    mov      [rsp+gprsize*1+16*3], r3d
    mov      [rsp+gprsize*2+16*3], r3d
    mov      [rsp+gprsize*3+16*3], coeffq

.loop:
    LOAD_8ROWS coeffq, 64
    mova     [rsp+16*1], m6
    lea      tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call     m(idct_8x8_internal_8bpc).pass1_end3
    pmulhrsw m7, [o(pw_8192)]
    mova     [rsp+16*0], m7
    mova     m7, [o(pw_8192)]
    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    mova     [rsp+16*1], m6
    mova     [rsp+16*2], m5
    call     m(idct_8x8_internal_8bpc).end3
    lea      dstq, [dstq+strideq*2]

    pxor     m7, m7
    ; zero the eight coefficient rows that were just consumed
    REPX     {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7

    add      coeffq, 16
    dec      r3d
    jg .loop

    mov      r4d, [rsp+gprsize*2+16*3]
    dec      r4d
    jle .ret

    mov      dstq, [rsp+gprsize*0+16*3]
    mov      coeffq, [rsp+gprsize*3+16*3]
    mov      [rsp+gprsize*2+16*3], r4
    lea      r3, [dstq+8]
    add      coeffq, 64*8
    mov      [rsp+gprsize*0+16*3], r3
    mov      r3d, [rsp+gprsize*1+16*3]
    mov      [rsp+gprsize*3+16*3], coeffq
    jmp .loop

.ret:
    RET


; inv_txfm_add_dct_dct_16x64_8bpc
; eob == 0 takes the DC-only shortcut: it tail-jumps to the 16x4 routine's
; .dconly with r2d = 32 and tx2q = .end. Any other eob is handled by
; idct_16x64_internal_8bpc.
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA      r5, $$
%endif
    test     eobd, eobd
    jz .dconly

    call     m(idct_16x64_internal_8bpc)
    RET

.dconly:
    movd     m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd     m2, [o(pw_8192)]
    mov      [coeffq], eobd  ;eobd is zero on this path: clears the DC coefficient
    mov      r2d, 32
    lea      tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly

.end:
    RET


; idct_16x64_internal_8bpc
; Scalar state kept on the stack (offsets relative to rsp inside this routine):
;   [rsp+gprsize*1+16*67]  eob-151; a negative value selects .fast in pass 2
;   [rsp+gprsize*2+16*67]  coeff pointer saved for pass 2; during pass 2 it
;                          holds the next dst column (dst+8)
;   [rsp+gprsize*3+16*67]  pass-2 loop counter
; Pass 1 runs 2 iterations when eob < 151 and 4 otherwise; pass 2 runs twice.
; During pass 2, r4 holds the address of .end1.
cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov      r4d, 2
    sub      eobd, 151
    mov      [rsp+gprsize*1+16*67], eobd
    mov      r3d, 4
    cmovs    r3d, r4d  ;uses the sign flag of the sub above

%if ARCH_X86_32
    LEA      r5, $$
%endif

    mov      [rsp+gprsize*2+16*67], coeffq

.pass1_loop:
    LOAD_8ROWS coeffq+64*0, 64*2
    call     m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+64*1, 64*2
    call     m(idct_16x8_internal_8bpc).main
    mova     m7, [o(pw_8192)]
    lea      tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova     [rsp+gprsize+16*0], m7
    mova     m7, [o(pw_8192)]
    lea      tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+64*0, 64

    add      coeffq, 16
    dec      r3d
    jg .pass1_loop

    mov      coeffq, [rsp+gprsize*2+16*67]
    mov      r3d, 2
    lea      r4, [dstq+8]
    mov      [rsp+gprsize*2+16*67], r4  ;dst of the next pass-2 iteration
    lea      r4, [o(m(idct_16x64_internal_8bpc).end1)]

.pass2_loop:
    mov      [rsp+gprsize*3+16*67], r3d
    mov      eobd, [rsp+gprsize*1+16*67]

    mova     m0, [coeffq+16*4 ]  ;in1
    mova     m1, [coeffq+16*12]  ;in3
    mova     m2, [coeffq+16*20]  ;in5
    mova     m3, [coeffq+16*28]  ;in7
    mova     m4, [coeffq+16*5 ]  ;in9
    mova     m5, [coeffq+16*13]  ;in11
    mova     m6, [coeffq+16*21]  ;in13
    mova     m7, [coeffq+16*29]  ;in15
    mova     [rsp+gprsize+16*35], m0  ;in1
    mova     [rsp+gprsize+16*49], m1  ;in3
    mova     [rsp+gprsize+16*43], m2  ;in5
    mova     [rsp+gprsize+16*41], m3  ;in7
    mova     [rsp+gprsize+16*39], m4  ;in9
    mova     [rsp+gprsize+16*45], m5  ;in11
    mova     [rsp+gprsize+16*47], m6  ;in13
    mova     [rsp+gprsize+16*37], m7  ;in15

    pxor     m4, m4
    mova     m0, [coeffq+16*0]
    mova     m1, [coeffq+16*1]

    test     eobd, eobd
    jl .fast  ;eob < 151

.full:
    mova     m2, [coeffq+16*2]
    mova     m3, [coeffq+16*3]

    REPX     {mova x, m4}, m5, m6, m7
    call     m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor     m4, m4
    mova     m0, [coeffq+16*16]
    mova     m1, [coeffq+16*17]
    mova     m2, [coeffq+16*18]
    mova     m3, [coeffq+16*19]

    REPX     {mova x, m4}, m5, m6, m7
    call     m(idct_16x8_internal_8bpc).main
    mova     m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova     m0, [coeffq+16*8 ]
    mova     m1, [coeffq+16*24]
    mova     m2, [coeffq+16*9 ]
    mova     m3, [coeffq+16*25]
    mova     m4, [coeffq+16*10]
    mova     m5, [coeffq+16*26]
    mova     m6, [coeffq+16*11]
    mova     m7, [coeffq+16*27]
    mova     [rsp+gprsize+16*19], m0
    mova     [rsp+gprsize+16*26], m1
    mova     [rsp+gprsize+16*23], m2
; idct_16x64_internal_8bpc, pass 2 .full path (continued from the lines above)
    mova     [rsp+gprsize+16*22], m3
    mova     [rsp+gprsize+16*21], m4
    mova     [rsp+gprsize+16*24], m5
    mova     [rsp+gprsize+16*25], m6
    mova     [rsp+gprsize+16*20], m7

    call     m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    mova     m0, [coeffq+16*6 ]  ;in17
    mova     m1, [coeffq+16*14]  ;in19
    mova     m2, [coeffq+16*22]  ;in21
    mova     m3, [coeffq+16*30]  ;in23
    mova     m4, [coeffq+16*7 ]  ;in25
    mova     m5, [coeffq+16*15]  ;in27
    mova     m6, [coeffq+16*23]  ;in29
    mova     m7, [coeffq+16*31]  ;in31
    mova     [rsp+gprsize+16*63], m0  ;in17
    mova     [rsp+gprsize+16*53], m1  ;in19
    mova     [rsp+gprsize+16*55], m2  ;in21
    mova     [rsp+gprsize+16*61], m3  ;in23
    mova     [rsp+gprsize+16*59], m4  ;in25
    mova     [rsp+gprsize+16*57], m5  ;in27
    mova     [rsp+gprsize+16*51], m6  ;in29
    mova     [rsp+gprsize+16*65], m7  ;in31

    call .main
    jmp .end

; pass 2, reduced path: entered with m0/m1 loaded and m4 zeroed by the code
; before the branch; only two inputs feed each of the first two sub-transforms
.fast:
    REPX     {mova x, m4}, m2, m3, m5, m6, m7
    call     m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor     m4, m4
    mova     m0, [coeffq+16*16]
    mova     m1, [coeffq+16*17]

    REPX     {mova x, m4}, m2, m3, m5, m6, m7
    call     m(idct_16x8_internal_8bpc).main
    mova     m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    mova     m0, [coeffq+16*8 ]
    mova     m1, [coeffq+16*24]
    mova     m2, [coeffq+16*9 ]
    mova     m3, [coeffq+16*25]
    mova     [rsp+gprsize+16*19], m0  ;in1
    mova     [rsp+gprsize+16*26], m1  ;in3
    mova     [rsp+gprsize+16*23], m2  ;in5
    mova     [rsp+gprsize+16*22], m3  ;in7

    call     m(idct_8x32_internal_8bpc).main_veryfast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    call .main_fast

; .end hands the first group of rows to idct_8x32's .end2 with r3 = r4
; (the address of .end1) as the continuation.
.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova     [rsp+gprsize+16*0], m7
    mov      r3, r4
    jmp m(idct_8x32_internal_8bpc).end2

; .end1 moves rsp up by 16*32 before jumping to idct_8x32's .end (continuation
; .end2); .end2 moves rsp back down by the same amount.
.end1:
    LOAD_8ROWS rsp+gprsize+16*35, 16
    lea      dstq, [dstq+strideq*2]
    add      rsp, 16*32
    lea      r3, [o(m(idct_16x64_internal_8bpc).end2)]
    jmp m(idct_8x32_internal_8bpc).end

.end2:
    add      coeffq, 16*32
    sub      rsp, 16*32

    mov      dstq, [rsp+gprsize*2+16*67]
    mov      r3d, [rsp+gprsize*3+16*67]
    lea      r4, [dstq+8]
    mov      [rsp+gprsize*2+16*67], r4
    lea      r4, [o(m(idct_16x64_internal_8bpc).end1)]

    dec      r3d
    jg .pass2_loop
    ret


; .main_fast: variant of .main that reads only in1, in15, in9, in7, in5, in11,
; in13 and in3, then joins the common code at .main2.
; It is entered with `call`, and its stack offsets use gprsize*2 where the
; caller used gprsize (consistent with the pushed return address).
; m7 = pd_2048 and is passed to every ITX_MULSUB_2W as its fifth argument.
ALIGN function_align
.main_fast:
    mova     m0, [rsp+gprsize*2+16*35]  ;in1
    pmulhrsw m3, m0, [o(pw_4095x8)]  ;t62,t63
    pmulhrsw m0, [o(pw_101x8)]  ;t32,t33
    mova     m7, [o(pd_2048)]
    mova     [rsp+gprsize*2+16*35], m0  ;t32
    mova     [rsp+gprsize*2+16*66], m3  ;t63
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076  ;t33a, t62a
    mova     [rsp+gprsize*2+16*36], m3  ;t33a
    mova     [rsp+gprsize*2+16*65], m0  ;t62a

    mova     m1, [rsp+gprsize*2+16*37]  ;in15
    pmulhrsw m2, m1, [o(pw_3822x8)]  ;t60,t61
    pmulhrsw m1, [o(pw_m1474x8)]  ;t34,t35
    mova     [rsp+gprsize*2+16*38], m1  ;t35
    mova     [rsp+gprsize*2+16*63], m2  ;t60
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401  ;t34a, t61a
    mova     [rsp+gprsize*2+16*37], m2  ;t34a
    mova     [rsp+gprsize*2+16*64], m1  ;t61a

    mova     m0, [rsp+gprsize*2+16*39]  ;in9
    pmulhrsw m3, m0, [o(pw_3996x8)]  ;t58,t59
    pmulhrsw m0, [o(pw_897x8)]  ;t36,t37
    mova     [rsp+gprsize*2+16*39], m0  ;t36
    mova     [rsp+gprsize*2+16*62], m3  ;t59
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598  ;t37a, t58a
    mova     [rsp+gprsize*2+16*40], m3  ;t37a
    mova     [rsp+gprsize*2+16*61], m0  ;t58a

    mova     m1, [rsp+gprsize*2+16*41]  ;in7
    pmulhrsw m2, m1, [o(pw_4036x8)]  ;t56,t57
    pmulhrsw m1, [o(pw_m700x8)]  ;t38,t39
    mova     [rsp+gprsize*2+16*42], m1  ;t39
    mova     [rsp+gprsize*2+16*59], m2  ;t56
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166  ;t38a, t57a
    mova     [rsp+gprsize*2+16*41], m2  ;t38a
    mova     [rsp+gprsize*2+16*60], m1  ;t57a

    mova     m0, [rsp+gprsize*2+16*43]  ;in5
    pmulhrsw m3, m0, [o(pw_4065x8)]  ;t54,t55
    pmulhrsw m0, [o(pw_501x8)]  ;t40,t41
    mova     [rsp+gprsize*2+16*43], m0  ;t40
; idct_16x64_internal_8bpc.main_fast (continued from the lines above)
    mova     [rsp+gprsize*2+16*58], m3  ;t55
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612  ;t41a, t54a
    mova     [rsp+gprsize*2+16*44], m3  ;t41a
    mova     [rsp+gprsize*2+16*57], m0  ;t54a

    mova     m1, [rsp+gprsize*2+16*45]  ;in11
    pmulhrsw m2, m1, [o(pw_3948x8)]  ;t52,t53
    pmulhrsw m1, [o(pw_m1092x8)]  ;t42,t43
    mova     [rsp+gprsize*2+16*46], m1  ;t43
    mova     [rsp+gprsize*2+16*55], m2  ;t52
    ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931  ;t42a, t53a
    mova     [rsp+gprsize*2+16*45], m2  ;t42a
    mova     [rsp+gprsize*2+16*56], m1  ;t53a

    mova     m0, [rsp+gprsize*2+16*47]  ;in13
    pmulhrsw m3, m0, [o(pw_3889x8)]  ;t50,t51
    pmulhrsw m0, [o(pw_1285x8)]  ;t44,t45
    mova     m6, m0  ;copy kept in m6 for .main2 (m0 is an ITX_MULSUB_2W operand below)
    mova     [rsp+gprsize*2+16*54], m3  ;t51
    ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189  ;t45a, t50a
    mova     [rsp+gprsize*2+16*48], m3  ;t45a
    mova     [rsp+gprsize*2+16*53], m0  ;t50a

    mova     m0, [rsp+gprsize*2+16*49]  ;in3
    pmulhrsw m3, m0, [o(pw_4085x8)]  ;t48,t49
    pmulhrsw m0, [o(pw_m301x8)]  ;t46,t47
    mova     m4, m3
    mova     m5, m0

    jmp .main2

; .main: full first stage. Each group combines one of in1..in15 with one of
; in17..in31, then falls through into .main2.
; m7 = pd_2048 and is passed to every ITX_MULSUB_2W as its fifth argument.
ALIGN function_align
.main:
    mova     m0, [rsp+gprsize*2+16*35]  ;in1
    mova     m1, [rsp+gprsize*2+16*65]  ;in31
    pmulhrsw m3, m0, [o(pw_4095x8)]  ;t63a
    pmulhrsw m0, [o(pw_101x8)]  ;t32a
    pmulhrsw m2, m1, [o(pw_2967x8)]  ;t62a
    pmulhrsw m1, [o(pw_m2824x8)]  ;t33a
    mova     m7, [o(pd_2048)]
    psubsw   m4, m0, m1  ;t33
    paddsw   m0, m1  ;t32
    psubsw   m5, m3, m2  ;t62
    paddsw   m3, m2  ;t63
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076  ;t33a, t62a
    mova     [rsp+gprsize*2+16*35], m0  ;t32
    mova     [rsp+gprsize*2+16*36], m5  ;t33a
    mova     [rsp+gprsize*2+16*65], m4  ;t62a
    mova     [rsp+gprsize*2+16*66], m3  ;t63

    mova     m0, [rsp+gprsize*2+16*63]  ;in17
    mova     m1, [rsp+gprsize*2+16*37]  ;in15
    pmulhrsw m3, m0, [o(pw_3745x8)]  ;t61a
    pmulhrsw m0, [o(pw_1660x8)]  ;t34a
    pmulhrsw m2, m1, [o(pw_3822x8)]  ;t60a
    pmulhrsw m1, [o(pw_m1474x8)]  ;t35a
    psubsw   m4, m1, m0  ;t34
    paddsw   m0, m1  ;t35
    psubsw   m5, m2, m3  ;t61
    paddsw   m3, m2  ;t60
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401  ;t34a, t61a
    mova     [rsp+gprsize*2+16*37], m5  ;t34a
    mova     [rsp+gprsize*2+16*38], m0  ;t35
    mova     [rsp+gprsize*2+16*63], m3  ;t60
    mova     [rsp+gprsize*2+16*64], m4  ;t61a

    mova     m0, [rsp+gprsize*2+16*39]  ;in9
    mova     m1, [rsp+gprsize*2+16*61]  ;in23
    pmulhrsw m3, m0, [o(pw_3996x8)]  ;t59a
    pmulhrsw m0, [o(pw_897x8)]  ;t36a
    pmulhrsw m2, m1, [o(pw_3461x8)]  ;t58a
    pmulhrsw m1, [o(pw_m2191x8)]  ;t37a
    psubsw   m4, m0, m1  ;t37
    paddsw   m0, m1  ;t36
    psubsw   m5, m3, m2  ;t58
    paddsw   m3, m2  ;t59
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598  ;t37a, t58a
    mova     [rsp+gprsize*2+16*39], m0  ;t36
    mova     [rsp+gprsize*2+16*40], m5  ;t37a
    mova     [rsp+gprsize*2+16*61], m4  ;t58a
    mova     [rsp+gprsize*2+16*62], m3  ;t59

    mova     m0, [rsp+gprsize*2+16*59]  ;in25
    mova     m1, [rsp+gprsize*2+16*41]  ;in7
    pmulhrsw m3, m0, [o(pw_3349x8)]  ;t57a
    pmulhrsw m0, [o(pw_2359x8)]  ;t38a
    pmulhrsw m2, m1, [o(pw_4036x8)]  ;t56a
    pmulhrsw m1, [o(pw_m700x8)]  ;t39a
    psubsw   m4, m1, m0  ;t38
    paddsw   m0, m1  ;t39
    psubsw   m5, m2, m3  ;t57
    paddsw   m3, m2  ;t56
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166  ;t38a, t57a
    mova     [rsp+gprsize*2+16*41], m5  ;t38a
    mova     [rsp+gprsize*2+16*42], m0  ;t39
    mova     [rsp+gprsize*2+16*59], m3  ;t56
    mova     [rsp+gprsize*2+16*60], m4  ;t57a

    mova     m0, [rsp+gprsize*2+16*43]  ;in5
    mova     m1, [rsp+gprsize*2+16*57]  ;in27
    pmulhrsw m3, m0, [o(pw_4065x8)]  ;t55a
    pmulhrsw m0, [o(pw_501x8)]  ;t40a
    pmulhrsw m2, m1, [o(pw_3229x8)]  ;t54a
    pmulhrsw m1, [o(pw_m2520x8)]  ;t41a
    psubsw   m4, m0, m1  ;t41
    paddsw   m0, m1  ;t40
    psubsw   m5, m3, m2  ;t54
    paddsw   m3, m2  ;t55
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612  ;t41a, t54a
    mova     [rsp+gprsize*2+16*43], m0  ;t40
    mova     [rsp+gprsize*2+16*44], m5  ;t41a
    mova     [rsp+gprsize*2+16*57], m4  ;t54a
    mova     [rsp+gprsize*2+16*58], m3  ;t55

    mova     m0, [rsp+gprsize*2+16*55]  ;in21
    mova     m1, [rsp+gprsize*2+16*45]  ;in11
    pmulhrsw m3, m0, [o(pw_3564x8)]  ;t53a
    pmulhrsw m0, [o(pw_2019x8)]  ;t42a
    pmulhrsw m2, m1, [o(pw_3948x8)]  ;t52a
    pmulhrsw m1, [o(pw_m1092x8)]  ;t43a
    psubsw   m4, m1, m0  ;t42
    paddsw   m0, m1  ;t43
    psubsw   m5, m2, m3  ;t53
    paddsw   m3, m2  ;t52
    ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931  ;t42a, t53a
    mova     [rsp+gprsize*2+16*45], m5  ;t42a
    mova     [rsp+gprsize*2+16*46], m0  ;t43
    mova     [rsp+gprsize*2+16*55], m3  ;t52
    mova     [rsp+gprsize*2+16*56], m4  ;t53a

    mova     m0, [rsp+gprsize*2+16*47]  ;in13
    mova     m1, [rsp+gprsize*2+16*53]  ;in19
    pmulhrsw m3, m0, [o(pw_3889x8)]  ;t51a
    pmulhrsw m0, [o(pw_1285x8)]  ;t44a
    pmulhrsw m2, m1, [o(pw_3659x8)]  ;t50a
    pmulhrsw m1, [o(pw_m1842x8)]  ;t45a
    psubsw   m4, m0, m1  ;t45
    paddsw   m0, m1  ;t44
    psubsw   m5, m3, m2  ;t50
    paddsw   m3, m2  ;t51
    ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189  ;t45a, t50a
    mova     m6, m0  ;t44 stays in m6 for .main2
    mova     [rsp+gprsize*2+16*48], m5  ;t45a
    mova     [rsp+gprsize*2+16*53], m4  ;t50a
    mova     [rsp+gprsize*2+16*54], m3  ;t51

    mova     m0, [rsp+gprsize*2+16*51]  ;in29
    mova     m1, [rsp+gprsize*2+16*49]  ;in3
    pmulhrsw m3, m0, [o(pw_3102x8)]  ;t49a
    pmulhrsw m0, [o(pw_2675x8)]  ;t46a
    pmulhrsw m2, m1, [o(pw_4085x8)]  ;t48a
    pmulhrsw m1, [o(pw_m301x8)]  ;t47a
    psubsw   m5, m1, m0  ;t46
    paddsw   m0, m1  ;t47
    psubsw   m4, m2, m3  ;t49
    paddsw   m3, m2  ;t48

; .main2: common continuation of .main and .main_fast.
; Register contents on entry, per the comments of both callers:
; m0 = t47, m3 = t48, m4 = t49, m5 = t46, m6 = t44, m7 = pd_2048.
ALIGN function_align
.main2:
    ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920  ;t46a, t49a
    mova     m1, [rsp+gprsize*2+16*54]  ;t51
    psubsw   m2, m0, m6  ;t44a
    paddsw   m0, m6  ;t47a
    psubsw   m6, m3, m1  ;t51a
    paddsw   m3, m1  ;t48a
    mova     [rsp+gprsize*2+16*50], m0  ;t47a
    mova     [rsp+gprsize*2+16*51], m3  ;t48a
    ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406  ;t44, t51
    mova     [rsp+gprsize*2+16*47], m6  ;t44
    mova     [rsp+gprsize*2+16*54], m2  ;t51
; idct_16x64_internal_8bpc.main2 (continued): butterfly stages on t32..t63.
; m7 = pd_2048 throughout; m4 and m5 still hold t46a and t49a on entry here.

    mova     m0, [rsp+gprsize*2+16*48]  ;t45a
    mova     m3, [rsp+gprsize*2+16*53]  ;t50a
    psubsw   m2, m4, m0  ;t45
    paddsw   m4, m0  ;t46
    psubsw   m6, m5, m3  ;t50
    paddsw   m5, m3  ;t49
    ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406  ;t45a, t50a
    mova     [rsp+gprsize*2+16*48], m6  ;t45a
    mova     [rsp+gprsize*2+16*49], m4  ;t46
    mova     [rsp+gprsize*2+16*52], m5  ;t49
    mova     [rsp+gprsize*2+16*53], m2  ;t50a

    mova     m0, [rsp+gprsize*2+16*43]  ;t40
    mova     m2, [rsp+gprsize*2+16*46]  ;t43
    mova     m3, [rsp+gprsize*2+16*55]  ;t52
    mova     m1, [rsp+gprsize*2+16*58]  ;t55
    psubsw   m4, m0, m2  ;t43a
    paddsw   m0, m2  ;t40a
    psubsw   m5, m1, m3  ;t52a
    paddsw   m1, m3  ;t55a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276  ;t43, t52
    mova     [rsp+gprsize*2+16*43], m0  ;t40a
    mova     [rsp+gprsize*2+16*46], m5  ;t43
    mova     [rsp+gprsize*2+16*55], m4  ;t52
    mova     [rsp+gprsize*2+16*58], m1  ;t55a

    mova     m0, [rsp+gprsize*2+16*44]  ;t41a
    mova     m2, [rsp+gprsize*2+16*45]  ;t42a
    mova     m3, [rsp+gprsize*2+16*56]  ;t53a
    mova     m1, [rsp+gprsize*2+16*57]  ;t54a
    psubsw   m4, m0, m2  ;t42
    paddsw   m0, m2  ;t41
    psubsw   m5, m1, m3  ;t53
    paddsw   m1, m3  ;t54
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276  ;t42a, t53a
    mova     [rsp+gprsize*2+16*44], m0  ;t41
    mova     [rsp+gprsize*2+16*45], m5  ;t42a
    mova     [rsp+gprsize*2+16*56], m4  ;t53a
    mova     [rsp+gprsize*2+16*57], m1  ;t54

    mova     m0, [rsp+gprsize*2+16*41]  ;t38a
    mova     m2, [rsp+gprsize*2+16*40]  ;t37a
    mova     m3, [rsp+gprsize*2+16*61]  ;t58a
    mova     m1, [rsp+gprsize*2+16*60]  ;t57a
    psubsw   m4, m0, m2  ;t37
    paddsw   m0, m2  ;t38
    psubsw   m5, m1, m3  ;t58
    paddsw   m1, m3  ;t57
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799  ;t37a, t58a
    mova     [rsp+gprsize*2+16*41], m0  ;t38
    mova     [rsp+gprsize*2+16*40], m5  ;t37a
    mova     [rsp+gprsize*2+16*61], m4  ;t58a
    mova     [rsp+gprsize*2+16*60], m1  ;t57

    mova     m0, [rsp+gprsize*2+16*42]  ;t39
    mova     m2, [rsp+gprsize*2+16*39]  ;t36
    mova     m3, [rsp+gprsize*2+16*62]  ;t59
    mova     m1, [rsp+gprsize*2+16*59]  ;t56
    psubsw   m4, m0, m2  ;t36a
    paddsw   m0, m2  ;t39a
    psubsw   m5, m1, m3  ;t59a
    paddsw   m1, m3  ;t56a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799  ;t36, t59
    mova     [rsp+gprsize*2+16*42], m0  ;t39a
    mova     [rsp+gprsize*2+16*39], m5  ;t36
    mova     [rsp+gprsize*2+16*62], m4  ;t59
    mova     [rsp+gprsize*2+16*59], m1  ;t56a

    mova     m0, [rsp+gprsize*2+16*35]  ;t32
    mova     m2, [rsp+gprsize*2+16*38]  ;t35
    mova     m3, [rsp+gprsize*2+16*63]  ;t60
    mova     m1, [rsp+gprsize*2+16*66]  ;t63
    psubsw   m4, m0, m2  ;t35a
    paddsw   m0, m2  ;t32a
    psubsw   m5, m1, m3  ;t60a
    paddsw   m1, m3  ;t63a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017  ;t35, t60
    mova     [rsp+gprsize*2+16*35], m0  ;t32a
    mova     [rsp+gprsize*2+16*38], m5  ;t35
    mova     [rsp+gprsize*2+16*63], m4  ;t60
    mova     [rsp+gprsize*2+16*66], m1  ;t63a

    mova     m0, [rsp+gprsize*2+16*36]  ;t33a
    mova     m2, [rsp+gprsize*2+16*37]  ;t34a
    mova     m3, [rsp+gprsize*2+16*64]  ;t61a
    mova     m1, [rsp+gprsize*2+16*65]  ;t62a
    psubsw   m4, m0, m2  ;t34
    paddsw   m0, m2  ;t33
    psubsw   m5, m1, m3  ;t61
    paddsw   m1, m3  ;t62
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017  ;t34a, t61a

    ; m0/m1 (t33/t62) and m5/m4 (t34a/t61a) stay in registers for the next
    ; two groups
    mova     m2, [rsp+gprsize*2+16*41]  ;t38
    mova     m3, [rsp+gprsize*2+16*60]  ;t57
    psubsw   m6, m0, m2  ;t38a
    paddsw   m0, m2  ;t33a
    psubsw   m2, m1, m3  ;t57a
    paddsw   m1, m3  ;t62a
    mova     [rsp+gprsize*2+16*36], m0  ;t33a
    mova     [rsp+gprsize*2+16*65], m1  ;t62a
    ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784  ;t38, t57
    mova     [rsp+gprsize*2+16*41], m2  ;t38
    mova     [rsp+gprsize*2+16*60], m6  ;t57

    mova     m2, [rsp+gprsize*2+16*40]  ;t37a
    mova     m3, [rsp+gprsize*2+16*61]  ;t58a
    psubsw   m0, m5, m2  ;t37
    paddsw   m5, m2  ;t34
    psubsw   m1, m4, m3  ;t58
    paddsw   m4, m3  ;t61
    ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784  ;t37a, t58a
    mova     [rsp+gprsize*2+16*37], m5  ;t34
    mova     [rsp+gprsize*2+16*64], m4  ;t61
; idct_16x64_internal_8bpc.main2 (continued): m1/m0 hold the t37a/t58a results
; of the preceding ITX_MULSUB_2W; m7 = pd_2048 throughout.
    mova     [rsp+gprsize*2+16*40], m1  ;t37a
    mova     [rsp+gprsize*2+16*61], m0  ;t58a

    mova     m0, [rsp+gprsize*2+16*38]  ;t35
    mova     m2, [rsp+gprsize*2+16*39]  ;t36
    mova     m3, [rsp+gprsize*2+16*62]  ;t59
    mova     m1, [rsp+gprsize*2+16*63]  ;t60
    psubsw   m4, m0, m2  ;t36a
    paddsw   m0, m2  ;t35a
    psubsw   m5, m1, m3  ;t59a
    paddsw   m1, m3  ;t60a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784  ;t36, t59
    mova     [rsp+gprsize*2+16*38], m0  ;t35a
    mova     [rsp+gprsize*2+16*39], m5  ;t36
    mova     [rsp+gprsize*2+16*62], m4  ;t59
    mova     [rsp+gprsize*2+16*63], m1  ;t60a

    mova     m0, [rsp+gprsize*2+16*35]  ;t32a
    mova     m2, [rsp+gprsize*2+16*42]  ;t39a
    mova     m3, [rsp+gprsize*2+16*59]  ;t56a
    mova     m1, [rsp+gprsize*2+16*66]  ;t63a
    psubsw   m4, m0, m2  ;t39
    paddsw   m0, m2  ;t32
    psubsw   m5, m1, m3  ;t56
    paddsw   m1, m3  ;t63
    ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784  ;t39a, t56a
    mova     [rsp+gprsize*2+16*35], m0  ;t32
    mova     [rsp+gprsize*2+16*42], m5  ;t39a
    mova     [rsp+gprsize*2+16*59], m4  ;t56a
    mova     [rsp+gprsize*2+16*66], m1  ;t63

    mova     m0, [rsp+gprsize*2+16*50]  ;t47a
    mova     m2, [rsp+gprsize*2+16*43]  ;t40a
    mova     m3, [rsp+gprsize*2+16*58]  ;t55a
    mova     m1, [rsp+gprsize*2+16*51]  ;t48a
    psubsw   m4, m0, m2  ;t40
    paddsw   m0, m2  ;t47
    psubsw   m5, m1, m3  ;t55
    paddsw   m1, m3  ;t48
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t40a, t55a
    mova     [rsp+gprsize*2+16*50], m0  ;t47
    mova     [rsp+gprsize*2+16*43], m5  ;t40a
    mova     [rsp+gprsize*2+16*58], m4  ;t55a
    mova     [rsp+gprsize*2+16*51], m1  ;t48

    mova     m0, [rsp+gprsize*2+16*49]  ;t46
    mova     m2, [rsp+gprsize*2+16*44]  ;t41
    mova     m3, [rsp+gprsize*2+16*57]  ;t54
    mova     m1, [rsp+gprsize*2+16*52]  ;t49
    psubsw   m4, m0, m2  ;t41a
    paddsw   m0, m2  ;t46a
    psubsw   m5, m1, m3  ;t54a
    paddsw   m1, m3  ;t49a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t41, t54
    mova     [rsp+gprsize*2+16*49], m0  ;t46a
    mova     [rsp+gprsize*2+16*44], m5  ;t41
; idct_16x64_internal_8bpc.main2 (continued): m4/m1 hold the t54/t49a results
; of the preceding group; m7 = pd_2048 throughout.
    mova     [rsp+gprsize*2+16*57], m4  ;t54
    mova     [rsp+gprsize*2+16*52], m1  ;t49a

    mova     m0, [rsp+gprsize*2+16*48]  ;t45a
    mova     m2, [rsp+gprsize*2+16*45]  ;t42a
    mova     m3, [rsp+gprsize*2+16*56]  ;t53a
    mova     m1, [rsp+gprsize*2+16*53]  ;t50a
    psubsw   m4, m0, m2  ;t42
    paddsw   m0, m2  ;t45
    psubsw   m5, m1, m3  ;t53
    paddsw   m1, m3  ;t50
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t42a, t53a
    mova     [rsp+gprsize*2+16*48], m0  ;t45
    mova     [rsp+gprsize*2+16*45], m5  ;t42a
    mova     [rsp+gprsize*2+16*56], m4  ;t53a
    mova     [rsp+gprsize*2+16*53], m1  ;t50

    mova     m0, [rsp+gprsize*2+16*47]  ;t44
    mova     m2, [rsp+gprsize*2+16*46]  ;t43
    mova     m3, [rsp+gprsize*2+16*55]  ;t52
    mova     m1, [rsp+gprsize*2+16*54]  ;t51
    psubsw   m4, m0, m2  ;t43a
    paddsw   m0, m2  ;t44a
    psubsw   m5, m1, m3  ;t52a
    paddsw   m1, m3  ;t51a
    ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567  ;t43, t52

    ; Output stage begins: each t-value is added to / subtracted from a tmp[]
    ; row already on the stack and stored as an out row. m5/m4 (t43/t52) stay
    ; in registers for the group after this one.
    mova     m2, [rsp+gprsize*2+16*38]  ;t35a
    mova     m3, [rsp+gprsize*2+16*31]  ;tmp[28]
    psubsw   m6, m2, m0  ;t44
    paddsw   m2, m0  ;t35
    psubsw   m0, m3, m2  ;out35
    paddsw   m2, m3  ;out28
    mova     m3, [rsp+gprsize*2+16*63]  ;t60a
    mova     [rsp+gprsize*2+16*38], m0  ;out35
    mova     [rsp+gprsize*2+16*31], m2  ;out28
    psubsw   m0, m3, m1  ;t51
    paddsw   m3, m1  ;t60
    ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896  ;t44a, t51a
    mova     m2, [rsp+gprsize*2+16*6 ]  ;tmp[3]
    psubsw   m1, m2, m3  ;out60
    paddsw   m2, m3  ;out3
    mova     m3, [rsp+gprsize*2+16*22]  ;tmp[19]
    mova     [rsp+gprsize*2+16*63], m1  ;out60
    mova     [rsp+gprsize*2+16*6 ], m2  ;out3
    psubsw   m1, m3, m0  ;out44
    paddsw   m3, m0  ;out19
    mova     m2, [rsp+gprsize*2+16*15]  ;tmp[12]

    mova     m0, [rsp+gprsize*2+16*39]  ;t36
    mova     [rsp+gprsize*2+16*47], m1  ;out44
    mova     [rsp+gprsize*2+16*22], m3  ;out19
    mova     m1, [rsp+gprsize*2+16*62]  ;t59
    psubsw   m3, m2, m6  ;out51
    paddsw   m2, m6  ;out12
    mova     [rsp+gprsize*2+16*54], m3  ;out51
    mova     [rsp+gprsize*2+16*15], m2  ;out12
psubsw m2, m0, m5 ;t43a 5548 paddsw m0, m5 ;t36a 5549 mova m5, [rsp+gprsize*2+16*30] ;tmp[27] 5550 psubsw m3, m1, m4 ;t52a 5551 paddsw m1, m4 ;t59a 5552 ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 5553 mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] 5554 psubsw m6, m5, m0 ;out36 5555 paddsw m5, m0 ;out27 5556 psubsw m0, m4, m1 ;out59 5557 paddsw m4, m1 ;out4 5558 mova [rsp+gprsize*2+16*39], m6 ;out36 5559 mova [rsp+gprsize*2+16*30], m5 ;out27 5560 mova [rsp+gprsize*2+16*62], m0 ;out59 5561 mova [rsp+gprsize*2+16*7 ], m4 ;out4 5562 mova m0, [rsp+gprsize*2+16*23] ;tmp[20] 5563 mova m5, [rsp+gprsize*2+16*14] ;tmp[11] 5564 psubsw m4, m0, m3 ;out43 5565 paddsw m0, m3 ;out20 5566 psubsw m6, m5, m2 ;out52 5567 paddsw m5, m2 ;out11 5568 mova [rsp+gprsize*2+16*46], m4 ;out43 5569 mova [rsp+gprsize*2+16*23], m0 ;out20 5570 mova [rsp+gprsize*2+16*55], m6 ;out52 5571 mova [rsp+gprsize*2+16*14], m5 ;out11 5572 5573 mova m0, [rsp+gprsize*2+16*40] ;t37a 5574 mova m5, [rsp+gprsize*2+16*45] ;t42a 5575 mova m3, [rsp+gprsize*2+16*56] ;t53a 5576 mova m1, [rsp+gprsize*2+16*61] ;t58a 5577 mova m2, [rsp+gprsize*2+16*29] ;tmp[26] 5578 psubsw m4, m0, m5 ;t42 5579 paddsw m0, m5 ;t37 5580 psubsw m5, m1, m3 ;t53 5581 paddsw m1, m3 ;t58 5582 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 5583 mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] 5584 psubsw m6, m2, m0 ;out37 5585 paddsw m2, m0 ;out26 5586 psubsw m0, m3, m1 ;out58 5587 paddsw m3, m1 ;out5 5588 mova [rsp+gprsize*2+16*40], m6 ;out37 5589 mova [rsp+gprsize*2+16*29], m2 ;out26 5590 mova [rsp+gprsize*2+16*61], m0 ;out58 5591 mova [rsp+gprsize*2+16*8 ], m3 ;out5 5592 mova m0, [rsp+gprsize*2+16*24] ;tmp[21] 5593 mova m1, [rsp+gprsize*2+16*13] ;tmp[10] 5594 psubsw m2, m0, m5 ;out42 5595 paddsw m0, m5 ;out21 5596 psubsw m3, m1, m4 ;out53 5597 paddsw m1, m4 ;out10 5598 mova [rsp+gprsize*2+16*45], m2 ;out42 5599 mova [rsp+gprsize*2+16*24], m0 ;out21 5600 mova [rsp+gprsize*2+16*56], m3 ;out53 5601 mova [rsp+gprsize*2+16*13], m1 ;out10 5602 5603 mova 
m0, [rsp+gprsize*2+16*41] ;t38 5604 mova m5, [rsp+gprsize*2+16*44] ;t41 5605 mova m3, [rsp+gprsize*2+16*57] ;t54 5606 mova m1, [rsp+gprsize*2+16*60] ;t57 5607 mova m2, [rsp+gprsize*2+16*28] ;tmp[25] 5608 psubsw m4, m0, m5 ;t41a 5609 paddsw m0, m5 ;t38a 5610 psubsw m5, m1, m3 ;t54a 5611 paddsw m1, m3 ;t57a 5612 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a 5613 mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] 5614 psubsw m6, m2, m0 ;out38 5615 paddsw m2, m0 ;out25 5616 psubsw m0, m3, m1 ;out57 5617 paddsw m3, m1 ;out6 5618 mova [rsp+gprsize*2+16*41], m6 ;out38 5619 mova [rsp+gprsize*2+16*28], m2 ;out25 5620 mova [rsp+gprsize*2+16*60], m0 ;out57 5621 mova [rsp+gprsize*2+16*9 ], m3 ;out6 5622 mova m0, [rsp+gprsize*2+16*25] ;tmp[22] 5623 mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] 5624 psubsw m2, m0, m5 ;out41 5625 paddsw m0, m5 ;out22 5626 psubsw m3, m1, m4 ;out54 5627 paddsw m1, m4 ;out9 5628 mova [rsp+gprsize*2+16*44], m2 ;out41 5629 mova [rsp+gprsize*2+16*25], m0 ;out22 5630 mova [rsp+gprsize*2+16*57], m3 ;out54 5631 mova [rsp+gprsize*2+16*12], m1 ;out9 5632 5633 mova m0, [rsp+gprsize*2+16*42] ;t39a 5634 mova m5, [rsp+gprsize*2+16*43] ;t40a 5635 mova m3, [rsp+gprsize*2+16*58] ;t55a 5636 mova m1, [rsp+gprsize*2+16*59] ;t56a 5637 mova m2, [rsp+gprsize*2+16*27] ;tmp[24] 5638 psubsw m4, m0, m5 ;t40 5639 paddsw m0, m5 ;t39 5640 psubsw m5, m1, m3 ;t55 5641 paddsw m1, m3 ;t56 5642 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a 5643 mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] 5644 psubsw m6, m2, m0 ;out39 5645 paddsw m2, m0 ;out24 5646 psubsw m0, m3, m1 ;out56 5647 paddsw m3, m1 ;out7 5648 mova [rsp+gprsize*2+16*42], m6 ;out39 5649 mova [rsp+gprsize*2+16*27], m2 ;out24 5650 mova [rsp+gprsize*2+16*59], m0 ;out56 5651 mova [rsp+gprsize*2+16*10], m3 ;out7 5652 mova m0, [rsp+gprsize*2+16*26] ;tmp[23] 5653 mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] 5654 psubsw m2, m0, m5 ;out40 5655 paddsw m0, m5 ;out23 5656 psubsw m3, m1, m4 ;out55 5657 paddsw m1, m4 ;out8 5658 mova 
[rsp+gprsize*2+16*43], m2 ;out40 5659 mova [rsp+gprsize*2+16*26], m0 ;out23 5660 mova [rsp+gprsize*2+16*58], m3 ;out55 5661 mova [rsp+gprsize*2+16*11], m1 ;out8 5662 5663 mova m0, [rsp+gprsize*2+16*37] ;t34 5664 mova m5, [rsp+gprsize*2+16*48] ;t45 5665 mova m3, [rsp+gprsize*2+16*53] ;t50 5666 mova m1, [rsp+gprsize*2+16*64] ;t61 5667 mova m2, [rsp+gprsize*2+16*32] ;tmp[29] 5668 psubsw m4, m0, m5 ;t45a 5669 paddsw m0, m5 ;t34a 5670 psubsw m5, m1, m3 ;t50a 5671 paddsw m1, m3 ;t61a 5672 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 5673 mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] 5674 psubsw m6, m2, m0 ;out34 5675 paddsw m2, m0 ;out29 5676 psubsw m0, m3, m1 ;out61 5677 paddsw m3, m1 ;out2 5678 mova [rsp+gprsize*2+16*37], m6 ;out34 5679 mova [rsp+gprsize*2+16*32], m2 ;out29 5680 mova [rsp+gprsize*2+16*64], m0 ;out61 5681 mova [rsp+gprsize*2+16*5 ], m3 ;out2 5682 mova m0, [rsp+gprsize*2+16*21] ;tmp[18] 5683 mova m1, [rsp+gprsize*2+16*16] ;tmp[13] 5684 psubsw m2, m0, m5 ;out45 5685 paddsw m0, m5 ;out18 5686 psubsw m3, m1, m4 ;out50 5687 paddsw m1, m4 ;out13 5688 mova [rsp+gprsize*2+16*48], m2 ;out45 5689 mova [rsp+gprsize*2+16*21], m0 ;out18 5690 mova [rsp+gprsize*2+16*53], m3 ;out50 5691 mova [rsp+gprsize*2+16*16], m1 ;out13 5692 5693 mova m0, [rsp+gprsize*2+16*36] ;t33a 5694 mova m5, [rsp+gprsize*2+16*49] ;t46a 5695 mova m3, [rsp+gprsize*2+16*52] ;t49a 5696 mova m1, [rsp+gprsize*2+16*65] ;t62a 5697 mova m2, [rsp+gprsize*2+16*33] ;tmp[30] 5698 psubsw m4, m0, m5 ;t46 5699 paddsw m0, m5 ;t33 5700 psubsw m5, m1, m3 ;t49 5701 paddsw m1, m3 ;t62 5702 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 5703 mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] 5704 psubsw m6, m2, m0 ;out33 5705 paddsw m2, m0 ;out30 5706 psubsw m0, m3, m1 ;out62 5707 paddsw m3, m1 ;out1 5708 mova [rsp+gprsize*2+16*36], m6 ;out33 5709 mova [rsp+gprsize*2+16*33], m2 ;out30 5710 mova [rsp+gprsize*2+16*65], m0 ;out62 5711 mova [rsp+gprsize*2+16*4 ], m3 ;out1 5712 mova m0, [rsp+gprsize*2+16*20] ;tmp[17] 5713 mova 
m1, [rsp+gprsize*2+16*17] ;tmp[14] (operands of the mova whose mnemonic ends the previous line)
    psubsw    m2, m0, m5                    ;out46
    paddsw    m0, m5                        ;out17
    psubsw    m3, m1, m4                    ;out49
    paddsw    m1, m4                        ;out14
    mova      [rsp+gprsize*2+16*49], m2     ;out46
    mova      [rsp+gprsize*2+16*20], m0     ;out17
    mova      [rsp+gprsize*2+16*52], m3     ;out49
    mova      [rsp+gprsize*2+16*17], m1     ;out14

    mova      m0, [rsp+gprsize*2+16*35]     ;t32
    mova      m5, [rsp+gprsize*2+16*50]     ;t47
    mova      m3, [rsp+gprsize*2+16*51]     ;t48
    mova      m1, [rsp+gprsize*2+16*66]     ;t63
    mova      m2, [rsp+gprsize*2+16*34]     ;tmp[31]
    psubsw    m4, m0, m5                    ;t47a
    paddsw    m0, m5                        ;t32a
    psubsw    m5, m1, m3                    ;t48a
    paddsw    m1, m3                        ;t63a
    ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
    mova      m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
    psubsw    m6, m2, m0                    ;out32
    paddsw    m2, m0                        ;out31
    psubsw    m0, m3, m1                    ;out63
    paddsw    m3, m1                        ;out0
    mova      [rsp+gprsize*2+16*35], m6     ;out32
    mova      [rsp+gprsize*2+16*34], m2     ;out31
    mova      [rsp+gprsize*2+16*66], m0     ;out63
    mova      [rsp+gprsize*2+16*3 ], m3     ;out0
    mova      m0, [rsp+gprsize*2+16*19]     ;tmp[16]
    mova      m1, [rsp+gprsize*2+16*18]     ;tmp[15]
    psubsw    m2, m0, m5                    ;out47
    paddsw    m0, m5                        ;out16
    psubsw    m3, m1, m4                    ;out48
    paddsw    m1, m4                        ;out15
    mova      [rsp+gprsize*2+16*50], m2     ;out47
    mova      [rsp+gprsize*2+16*19], m0     ;out16
    mova      [rsp+gprsize*2+16*51], m3     ;out48
    mova      [rsp+gprsize*2+16*18], m1     ;out15
    ret


; inv_txfm_add_dct_dct_64x16_8bpc(dst, stride, coeff, eob)
; Public entry point for the 64x16 transform. The cglobal line declares
; 4 arguments, 6 GPRs, 8 XMM registers and 16*132 bytes of stack.
; eob == 0 takes the DC-only path (.dconly); any other value is handed to
; idct_64x16_internal_8bpc.
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly

    call      m(idct_64x16_internal_8bpc)
    RET

; DC-only path. Sets up m0 (scaled first coefficient), m1 (pw_2896x8),
; m2 (first rounding multiplier), r3d (row count) and tx2q (address to jump
; to once all rows are written), then falls into .body.
.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_8192)]
    mov       [coeffq], eobd                ; eobd is 0 on this path: clears the value just read
    mov       r3d, 16                       ; 16 rows
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)]

; Shared tail: the 64x32 and 64x64 .dconly paths jump here with
; m0/m1/m2/r3d/tx2q already prepared.
.body:
    pmulhrsw  m0, m2
    movd      m2, [o(pw_2048)]              ;intentionally rip-relative
    pmulhrsw  m0, m1
    pmulhrsw  m0, m2
    pshuflw   m0, m0, q0000
    punpcklwd m0, m0                        ; after the two shuffles every word of m0 is the DC term
    pxor      m7, m7                        ; zero register used to widen bytes to words

; One iteration per row: 4 x 16 bytes = 64 pixels. Each 16-byte group is split
; into high/low halves widened to words, m0 is added, and packuswb narrows the
; sums back to bytes with unsigned saturation.
.loop:
    mova      m1, [dstq+16*0]
    mova      m3, [dstq+16*1]
    mova      m5, [dstq+16*2]
    mova      m6, [dstq+16*3]
    punpckhbw m2, m1, m7
    punpcklbw m1, m7
    punpckhbw m4, m3, m7
    punpcklbw m3, m7
    paddw     m2, m0
    paddw     m1, m0
    paddw     m4, m0
    paddw     m3, m0
    packuswb  m1, m2
    packuswb  m3, m4
    punpckhbw m2, m5, m7
    punpcklbw m5, m7
    punpckhbw m4, m6, m7
    punpcklbw m6, m7
    paddw     m2, m0
    paddw     m5, m0
    paddw     m4, m0
    paddw     m6, m0
    packuswb  m5, m2
    packuswb  m6, m4
    mova      [dstq+16*0], m1
    mova      [dstq+16*1], m3
    mova      [dstq+16*2], m5
    mova      [dstq+16*3], m6
    add       dstq, strideq                 ; next row
    dec       r3d
    jg        .loop
    jmp       tx2q                          ; continue at the address chosen by the caller of .body

.end:
    RET


; LOAD_4ROWS src, stride[, is_rect2]
; Loads four 16-byte rows [src+stride*0..3] into m0-m3. When is_rect2 is
; non-zero each row is multiplied by pw_2896x8 with pmulhrsw as it is loaded
; (m3 briefly holds the constant before receiving the last row).
%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2

%if %3
    mova      m3, [o(pw_2896x8)]
    pmulhrsw  m0, m3, [%1+%2*0]
    pmulhrsw  m1, m3, [%1+%2*1]
    pmulhrsw  m2, m3, [%1+%2*2]
    pmulhrsw  m3, [%1+%2*3]
%else
    mova      m0, [%1+%2*0]
    mova      m1, [%1+%2*1]
    mova      m2, [%1+%2*2]
    mova      m3, [%1+%2*3]
%endif
%endmacro

; LOAD_4ROWS_H src, stride
; Loads four 16-byte rows [src+stride*0..3] into the upper registers m4-m7.
%macro LOAD_4ROWS_H 2 ;src, stride
    mova      m4, [%1+%2*0]
    mova      m5, [%1+%2*1]
    mova      m6, [%1+%2*2]
    mova      m7, [%1+%2*3]
%endmacro

; idct_64x16_internal_8bpc
; Worker reached by `call` from inv_txfm_add_dct_dct_64x16_8bpc. It is declared
; with 0 args / 0 regs / 0 stack, so the rsp-relative slots it uses are
; presumably inside the caller's 16*132-byte frame (NOTE(review): confirm the
; gprsize accounting against x86inc).
;
; Pass 1 runs .pass1_loop twice (r3d = 2), stepping coeffq and dstq by 16 bytes
; per iteration. Each iteration feeds coefficient rows to idct_8x8 .main,
; idct_16x8 .main, idct_8x32 .main_fast and idct_16x64 .main, then sends the
; eight 8-row groups stored at rsp+gprsize+16*3 .. 16*59 through
; idct_8x8_internal_8bpc.pass1_end1 with m7 = pw_8192. The first four groups
; are stored back over the coefficient buffer, the last four into the scratch
; area that dstq points at during pass 1.
;
; Pass 2 makes two runs of four iterations: .pass2_loop reads from coeffq and
; zeroes what it consumed, .pass2_loop2 reads from the scratch area. Every
; iteration jumps to idct_8x8_internal_8bpc.end twice (first with dstq advanced
; by 8 rows, then with the original dstq) and then moves dst 8 bytes right.
cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mov       r3d, 2                        ; pass-1 iteration count
    mov       [rsp+gprsize*2+16*67], dstq   ; park the real destination pointer
    lea       dstq, [rsp+gprsize+16*68]     ; dstq now addresses the scratch area

.pass1_loop:
    LOAD_4ROWS coeffq+32*0, 32*8
    pxor      m4, m4
    REPX      {mova x, m4}, m5, m6, m7      ; m4-m7 = 0
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor      m4, m4
    LOAD_4ROWS coeffq+32*4, 32*8

    REPX      {mova x, m4}, m5, m6, m7      ; m4-m7 = 0
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    ; Rows coeffq+32*2 + 32*4*k (k = 0..7) go to fixed stack slots ahead of
    ; idct_8x32 .main_fast.
    LOAD_8ROWS coeffq+32*2, 32*4
    mova      [rsp+gprsize+16*19], m0
    mova      [rsp+gprsize+16*26], m1
    mova      [rsp+gprsize+16*23], m2
    mova      [rsp+gprsize+16*22], m3
    mova      [rsp+gprsize+16*21], m4
    mova      [rsp+gprsize+16*24], m5
    mova      [rsp+gprsize+16*25], m6
    mova      [rsp+gprsize+16*20], m7

    call      m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    ; Odd-numbered input rows are placed in the slots read by idct_16x64 .main.
    LOAD_8ROWS coeffq+32*1, 32*2
    mova      [rsp+gprsize+16*35], m0       ;in1
    mova      [rsp+gprsize+16*49], m1       ;in3
    mova      [rsp+gprsize+16*43], m2       ;in5
    mova      [rsp+gprsize+16*41], m3       ;in7
    mova      [rsp+gprsize+16*39], m4       ;in9
    mova      [rsp+gprsize+16*45], m5       ;in11
    mova      [rsp+gprsize+16*47], m6       ;in13
    mova      [rsp+gprsize+16*37], m7       ;in15

    LOAD_8ROWS coeffq+32*17, 32*2
    mova      [rsp+gprsize+16*63], m0       ;in17
    mova      [rsp+gprsize+16*53], m1       ;in19
    mova      [rsp+gprsize+16*55], m2       ;in21
    mova      [rsp+gprsize+16*61], m3       ;in23
    mova      [rsp+gprsize+16*59], m4       ;in25
    mova      [rsp+gprsize+16*57], m5       ;in27
    mova      [rsp+gprsize+16*51], m6       ;in29
    mova      [rsp+gprsize+16*65], m7       ;in31

    call      m(idct_16x64_internal_8bpc).main

    ; Write-back chain. Each step loads an 8-row group, spills m7 to slot 0,
    ; loads pw_8192 into m7 and jumps to idct_8x8 .pass1_end1 with tx2q set to
    ; the next label (presumably that routine finishes with `jmp tx2q`).
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+32*0, 32
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+32*8, 32
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+32*16, 32
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end3:
    SAVE_8ROWS coeffq+32*24, 32
    LOAD_8ROWS rsp+gprsize+16*35, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end4:
    SAVE_8ROWS dstq+32*0, 32                ; from here on results go to the scratch area
    LOAD_8ROWS rsp+gprsize+16*43, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end5:
    SAVE_8ROWS dstq+32*8, 32
    LOAD_8ROWS rsp+gprsize+16*51, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end6:
    SAVE_8ROWS dstq+32*16, 32
    LOAD_8ROWS rsp+gprsize+16*59, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end7:
    SAVE_8ROWS dstq+32*24, 32

    add       coeffq, 16
    add       dstq, 16
    dec       r3d
    jg        .pass1_loop

.pass2:
    mov       dstq, [rsp+gprsize*2+16*67]   ; restore the real destination pointer
    sub       coeffq, 32                    ; undo the two 16-byte steps of pass 1
    mov       r3d, 4

.pass2_loop:
    mov       [rsp+gprsize*1+16*67], r3d    ; spill the counter: r3 is reused for dstq below

    LOAD_4ROWS coeffq+16*0, 32*2
    LOAD_4ROWS_H coeffq+16*1, 32*2
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+16*2, 32*2
    LOAD_4ROWS_H coeffq+16*3, 32*2
    call      m(idct_16x8_internal_8bpc).main

    mov       r3, dstq                      ; keep the current dst position
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).end)]
    lea       dstq, [dstq+strideq*8]        ; first hand-off targets 8 rows further down
    jmp       m(idct_8x8_internal_8bpc).end

.end:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).end1)]
    mov       dstq, r3                      ; second hand-off targets the saved position
    jmp       m(idct_8x8_internal_8bpc).end

.end1:
    pxor      m7, m7
    ; zero the 16*16 bytes starting at coeffq
    REPX      {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

    add       coeffq, 16*16
    mov       r3d, [rsp+gprsize*1+16*67]    ; reload the loop counter
    mov       dstq, [rsp+gprsize*2+16*67]
    add       dstq, 8                       ; 8 bytes to the right
    mov       [rsp+gprsize*2+16*67], dstq
    dec       r3d
    jg        .pass2_loop

    mov       r3d, 4
    lea       coeffq, [rsp+gprsize+16*68]   ; second run reads the pass-1 scratch area
.pass2_loop2:
    mov       [rsp+gprsize*1+16*67], r3d

    LOAD_4ROWS coeffq+16*0, 32*2
    LOAD_4ROWS_H coeffq+16*1, 32*2
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+16*2, 32*2
    LOAD_4ROWS_H coeffq+16*3, 32*2
    call      m(idct_16x8_internal_8bpc).main

    mov       r3, dstq
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).end2)]
    lea       dstq, [dstq+strideq*8]
    jmp       m(idct_8x8_internal_8bpc).end

.end2:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x16_internal_8bpc).end3)]
    mov       dstq, r3
    jmp       m(idct_8x8_internal_8bpc).end

.end3:
    ; unlike .end1, the scratch area is not zeroed after use

    add       coeffq, 16*16
    mov       r3d, [rsp+gprsize*1+16*67]
    mov       dstq, [rsp+gprsize*2+16*67]
    add       dstq, 8
    mov       [rsp+gprsize*2+16*67], dstq
    dec       r3d
    jg        .pass2_loop2
    ret


; inv_txfm_add_dct_dct_32x64_8bpc(dst, stride, coeff, eob)
; Public entry point for the 32x64 transform (16*68 bytes of stack).
; eob == 0 prepares the DC-only operands and reuses the .body of
; inv_txfm_add_dct_dct_32x8_8bpc with r3d = 64 rows; otherwise the work is done
; by idct_32x64_internal_8bpc.
cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly

    call      m(idct_32x64_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_16384)]
    mov       [coeffq], eobd                ; eobd is 0 on this path: clears the value just read
    pmulhrsw  m0, m1                        ; second multiply by pw_2896x8 (absent in the 64x16 path)
    mov       r3d, 64
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)]
    jmp       m(inv_txfm_add_dct_dct_32x8_8bpc).body

.end:
    RET


; idct_32x64_internal_8bpc
; Worker reached by `call` from inv_txfm_add_dct_dct_32x64_8bpc.
; Prologue: eob-136 is stored at [rsp+gprsize*1+16*67] and r3d receives the
; pass-1 iteration count, 4 when eob >= 136 and 2 otherwise (cmovs acts on the
; sign flag left by the sub; the intervening mov instructions do not alter flags).
cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov       r4d, 2
    sub       eobd, 136
    mov       [rsp+gprsize*1+16*67], eobd
    mov       r3d, 4
    cmovs     r3d, r4d

%if ARCH_X86_32
    LEA       r5, $$
%endif
; idct_32x64_internal_8bpc, continued: pass 1 and the hand-off to pass 2.

    mov       [rsp+gprsize*2+16*67], coeffq ; remember the coefficient base for pass 2

; Pass 1: one iteration per 16-byte step of coeffq, r3d iterations in total.
; All coefficient loads pass 1 as the third LOAD_*ROWS argument.
.pass1_loop:
    LOAD_8ROWS coeffq+64*1, 64*2, 1
    mova      [rsp+gprsize+16*19], m0       ;in1
    mova      [rsp+gprsize+16*26], m1       ;in3
    mova      [rsp+gprsize+16*23], m2       ;in5
    mova      [rsp+gprsize+16*22], m3       ;in7
    mova      [rsp+gprsize+16*21], m4       ;in9
    mova      [rsp+gprsize+16*24], m5       ;in11
    mova      [rsp+gprsize+16*25], m6       ;in13
    mova      [rsp+gprsize+16*20], m7       ;in15

    mov       tx2d, [rsp+gprsize*1+16*67]   ; the stored eob-136
    test      tx2d, tx2d
    jl        .fast                         ; negative: take the reduced-input path

.full:
    LOAD_8ROWS coeffq+64*0, 64*4, 1
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_8ROWS coeffq+64*2, 64*4, 1
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+64*17, 64*2, 1
    mova      [rsp+gprsize+16*33], m0       ;in17
    mova      [rsp+gprsize+16*28], m1       ;in19
    mova      [rsp+gprsize+16*29], m2       ;in21
    mova      [rsp+gprsize+16*32], m3       ;in23
    mova      [rsp+gprsize+16*31], m4       ;in25
    mova      [rsp+gprsize+16*30], m5       ;in27
    mova      [rsp+gprsize+16*27], m6       ;in29
    mova      [rsp+gprsize+16*34], m7       ;in31

    call      m(idct_8x32_internal_8bpc).main
    jmp       .pass1_end

; Reduced path: only four rows are loaded per stage, m4-m7 are zeroed, and
; idct_8x32 is entered at .main_fast without the in17..in31 slots being filled.
.fast:
    LOAD_4ROWS coeffq, 256, 1
    pxor      m4, m4
    REPX      {mova x, m4}, m5, m6, m7
    call      m(idct_8x8_internal_8bpc).main

    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+128*1, 256, 1
    pxor      m4, m4
    REPX      {mova x, m4}, m5, m6, m7
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    call      m(idct_8x32_internal_8bpc).main_fast

; Write-back chain: four 8-row groups go through idct_8x8 .pass1_end, with
; tx2q naming the label to continue at, and are stored back over coeffq.
.pass1_end:
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS coeffq+64*24, 64

    add       coeffq, 16
    dec       r3d
    jg        .pass1_loop

; Pass 2 is delegated to the 16x64 worker's .pass2_loop: coeffq is rewound to
; the saved base, r3d = 4, dst+8 is stored in the slot that previously held
; the coefficient base, and r4 carries the address of that worker's .end1
; (presumably used as its continuation target -- confirm in idct_16x64).
.pass2:
    mov       coeffq, [rsp+gprsize*2+16*67]
    mov       r3d, 4
    lea       r4, [dstq+8]
    mov       [rsp+gprsize*2+16*67], r4
    lea       r4, [o(m(idct_16x64_internal_8bpc).end1)]
    jmp       m(idct_16x64_internal_8bpc).pass2_loop


; inv_txfm_add_dct_dct_64x32_8bpc(dst, stride, coeff, eob)
; Public entry point for the 64x32 transform (16*197 bytes of stack).
; eob == 0 prepares the DC-only operands and reuses the .body of
; inv_txfm_add_dct_dct_64x16_8bpc with r3d = 32 rows; otherwise the work is
; done by idct_64x32_internal_8bpc.
cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly

    call      m(idct_64x32_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_16384)]
    pmulhrsw  m0, m1                        ; second multiply by pw_2896x8
    mov       [coeffq], eobd                ; eobd is 0 on this path: clears the value just read
    mov       r3d, 32
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
    jmp       m(inv_txfm_add_dct_dct_64x16_8bpc).body

.end:
    RET

; idct_64x32_internal_8bpc
; Worker reached by `call` from inv_txfm_add_dct_dct_64x32_8bpc.
; Prologue: stores eob-136, picks the pass-1 iteration count (r3d = 4 when
; eob >= 136, else 2) and saves three pointers in stack slots:
;   [rsp+gprsize*2+16*67] = coeff base
;   [rsp+gprsize*3+16*67] = real dst
;   [rsp+gprsize*4+16*67] = scratch area (rsp+gprsize+16*69), which dstq
;                           addresses during pass 1
cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov       r4d, 2
    sub       eobd, 136
    mov       [rsp+gprsize*1+16*67], eobd
    mov       r3d, 4
    cmovs     r3d, r4d                      ; sign flag still comes from the sub above

%if ARCH_X86_32
    LEA       r5, $$
%endif

    mov       [rsp+gprsize*2+16*67], coeffq
    mov       [rsp+gprsize*3+16*67], dstq
    lea       dstq, [rsp+gprsize+16*69]
    mov       [rsp+gprsize*4+16*67], dstq

.pass1_loop:
    LOAD_4ROWS coeffq+64*0, 64*8, 1
    pxor      m4, m4
    REPX      {mova x, m4}, m5, m6, m7      ; m4-m7 = 0
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor      m4, m4
; idct_64x32_internal_8bpc .pass1_loop, continued (m4 was zeroed just before this point).
    LOAD_4ROWS coeffq+64*4, 64*8, 1

    REPX      {mova x, m4}, m5, m6, m7      ; m5-m7 = 0 as well
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    ; Rows coeffq+64*2 + 64*4*k (k = 0..7) go to fixed stack slots ahead of
    ; idct_8x32 .main_fast.
    LOAD_8ROWS coeffq+64*2, 64*4, 1
    mova      [rsp+gprsize+16*19], m0
    mova      [rsp+gprsize+16*26], m1
    mova      [rsp+gprsize+16*23], m2
    mova      [rsp+gprsize+16*22], m3
    mova      [rsp+gprsize+16*21], m4
    mova      [rsp+gprsize+16*24], m5
    mova      [rsp+gprsize+16*25], m6
    mova      [rsp+gprsize+16*20], m7

    call      m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    ; Odd-numbered input rows are placed in the slots read by idct_16x64 .main.
    LOAD_8ROWS coeffq+64*1, 64*2, 1
    mova      [rsp+gprsize+16*35], m0       ;in1
    mova      [rsp+gprsize+16*49], m1       ;in3
    mova      [rsp+gprsize+16*43], m2       ;in5
    mova      [rsp+gprsize+16*41], m3       ;in7
    mova      [rsp+gprsize+16*39], m4       ;in9
    mova      [rsp+gprsize+16*45], m5       ;in11
    mova      [rsp+gprsize+16*47], m6       ;in13
    mova      [rsp+gprsize+16*37], m7       ;in15

    LOAD_8ROWS coeffq+64*17, 64*2, 1
    mova      [rsp+gprsize+16*63], m0       ;in17
    mova      [rsp+gprsize+16*53], m1       ;in19
    mova      [rsp+gprsize+16*55], m2       ;in21
    mova      [rsp+gprsize+16*61], m3       ;in23
    mova      [rsp+gprsize+16*59], m4       ;in25
    mova      [rsp+gprsize+16*57], m5       ;in27
    mova      [rsp+gprsize+16*51], m6       ;in29
    mova      [rsp+gprsize+16*65], m7       ;in31

    call      m(idct_16x64_internal_8bpc).main

    ; Write-back chain: each step loads an 8-row group, spills m7 to slot 0 and
    ; jumps to idct_8x8 .pass1_end with tx2q naming the label to continue at.
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2:
    ; idct_64x32_internal_8bpc write-back chain, continued: groups 3-4 are
    ; stored over coeffq, groups 5-8 into the scratch area addressed by dstq.
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    SAVE_8ROWS coeffq+64*24, 64
    LOAD_8ROWS rsp+gprsize+16*35, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS dstq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*43, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end5:
    SAVE_8ROWS dstq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*51, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end6:
    SAVE_8ROWS dstq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*59, 16
    mova      [rsp+gprsize+16*0], m7
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end

.pass1_end7:
    SAVE_8ROWS dstq+64*24, 64

    add       coeffq, 16
    add       dstq, 16
    dec       r3d
    jg        .pass1_loop

; Pass 2, first run: the scratch area is used as the coefficient source and
; the output position is dst+32. The work is done by the 32x32 worker's
; .pass2_loop, with tx2q pointing back at .pass2_end below.
.pass2:
    mov       coeffq, [rsp+gprsize*4+16*67] ; scratch area pointer saved in the prologue
    mov       dstq, [rsp+gprsize*3+16*67]   ; real dst saved in the prologue
    mov       eobd, [rsp+gprsize*1+16*67]   ; the stored eob-136
    lea       dstq, [dstq+32]
    mov       [rsp+gprsize*1+16*35], eobd   ; copied to a 16*35-based slot (presumably where the 32x32 code reads it)
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
    mov       r3d, 4
    jmp       m(idct_32x32_internal_8bpc).pass2_loop

.pass2_end:
    mova      [rsp+gprsize+16*0], m7
    lea       r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)]
    jmp       m(idct_8x32_internal_8bpc).end2

.pass2_end1:
    lea       tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
    add       coeffq, 16*32
    mov       dstq, [rsp+gprsize*2+16*35]   ; NOTE(review): these two slots are not written in this routine;
    mov       r3d, [rsp+gprsize*3+16*35]    ; presumably the 32x32 .pass2_loop saves dst and the counter here
    dec       r3d
    jg        m(idct_32x32_internal_8bpc).pass2_loop

.pass2_end2:
; Body of idct_64x32_internal_8bpc.pass2_end2 -- pass 2, second run: the real
; dst and the original coefficient buffer are restored and control moves to the
; 32x32 worker's pass-2 loop with that worker's own .pass2_end as tx2q.
    mov       dstq, [rsp+gprsize*3+16*67]
    mov       coeffq, [rsp+gprsize*2+16*67]
    lea       tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
    mov       r3d, 4
    jmp       m(idct_32x32_internal_8bpc).pass2_loop


; inv_txfm_add_dct_dct_64x64_8bpc(dst, stride, coeff, eob)
; Public entry point for the 64x64 transform (16*197 bytes of stack).
; eob == 0 prepares the DC-only operands and reuses the .body of
; inv_txfm_add_dct_dct_64x16_8bpc with r3d = 64 rows; otherwise the work is
; done by idct_64x64_internal_8bpc.
cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA       r5, $$
%endif
    test      eobd, eobd
    jz        .dconly

    call      m(idct_64x64_internal_8bpc)
    RET

.dconly:
    movd      m1, [o(pw_2896x8)]
    pmulhrsw  m0, m1, [coeffq]
    movd      m2, [o(pw_8192)]
    mov       [coeffq], eobd                ; eobd is 0 on this path: clears the value just read
    mov       r3d, 64
    ; NOTE(review): the continuation is the .end label of the 64x32 entry point
    ; (a RET); this function has no .end of its own. Both entry points declare
    ; the same 16*197 stack size.
    lea       tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
    jmp       m(inv_txfm_add_dct_dct_64x16_8bpc).body

; idct_64x64_internal_8bpc
; Worker reached by `call` from inv_txfm_add_dct_dct_64x64_8bpc.
; Prologue: r4d = 4 when eob >= 136, else 2 (cmovns on the sign of eob-136);
; that value becomes the pass-1 iteration count in r3d. Slots written:
;   [rsp+gprsize*1+16*67] = eob-136
;   [rsp+gprsize*4+16*67] = coeff base
;   [rsp+gprsize*3+16*67] = real dst
;   [rsp+gprsize*2+16*67] = scratch area (rsp+gprsize+16*69), which dstq
;                           addresses during pass 1
; Coefficient loads in this routine do not pass the extra scaling argument.
cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    mov       r5d, 4
    mov       r4d, 2
    sub       eobd, 136
    cmovns    r4d, r5d

%if ARCH_X86_32
    LEA       r5, $$                        ; r5 was overwritten with 4 above, so it is loaded again
%endif

    mov       [rsp+gprsize*1+16*67], eobd
    mov       r3d, r4d
    mov       [rsp+gprsize*4+16*67], coeffq
    mov       [rsp+gprsize*3+16*67], dstq
    lea       dstq, [rsp+gprsize+16*69]
    mov       [rsp+gprsize*2+16*67], dstq

.pass1_loop:
    LOAD_4ROWS coeffq+64*0, 64*8
    pxor      m4, m4
    REPX      {mova x, m4}, m5, m6, m7      ; m4-m7 = 0
    call      m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16

    pxor      m4, m4
    LOAD_4ROWS coeffq+64*4, 64*8

    REPX      {mova x, m4}, m5, m6, m7      ; m4-m7 = 0
    call      m(idct_16x8_internal_8bpc).main
    mova      m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16

    LOAD_8ROWS coeffq+64*2, 64*4
    mova      [rsp+gprsize+16*19], m0
    mova      [rsp+gprsize+16*26], m1
    mova      [rsp+gprsize+16*23], m2
    mova      [rsp+gprsize+16*22], m3
    mova      [rsp+gprsize+16*21], m4
    mova      [rsp+gprsize+16*24], m5
    mova      [rsp+gprsize+16*25], m6
    mova      [rsp+gprsize+16*20], m7

    call      m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16

    ; Odd-numbered input rows are placed in the slots read by idct_16x64 .main.
    LOAD_8ROWS coeffq+64*1, 64*2
    mova      [rsp+gprsize+16*35], m0       ;in1
; idct_64x64_internal_8bpc .pass1_loop, continued: remaining odd input rows.
    mova      [rsp+gprsize+16*49], m1       ;in3
    mova      [rsp+gprsize+16*43], m2       ;in5
    mova      [rsp+gprsize+16*41], m3       ;in7
    mova      [rsp+gprsize+16*39], m4       ;in9
    mova      [rsp+gprsize+16*45], m5       ;in11
    mova      [rsp+gprsize+16*47], m6       ;in13
    mova      [rsp+gprsize+16*37], m7       ;in15

    LOAD_8ROWS coeffq+64*17, 64*2
    mova      [rsp+gprsize+16*63], m0       ;in17
    mova      [rsp+gprsize+16*53], m1       ;in19
    mova      [rsp+gprsize+16*55], m2       ;in21
    mova      [rsp+gprsize+16*61], m3       ;in23
    mova      [rsp+gprsize+16*59], m4       ;in25
    mova      [rsp+gprsize+16*57], m5       ;in27
    mova      [rsp+gprsize+16*51], m6       ;in29
    mova      [rsp+gprsize+16*65], m7       ;in31

    call      m(idct_16x64_internal_8bpc).main

    ; Write-back chain. Each step loads an 8-row group, spills m7 to slot 0,
    ; loads pw_8192 into m7 and jumps to idct_8x8 .pass1_end1 with tx2q naming
    ; the label to continue at. Groups 1-4 are stored over coeffq, groups 5-8
    ; into the scratch area addressed by dstq.
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    SAVE_8ROWS coeffq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*11, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS coeffq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS coeffq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end3:
    SAVE_8ROWS coeffq+64*24, 64
    LOAD_8ROWS rsp+gprsize+16*35, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end4:
    SAVE_8ROWS dstq+64*0, 64
    LOAD_8ROWS rsp+gprsize+16*43, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end5:
    SAVE_8ROWS dstq+64*8, 64
    LOAD_8ROWS rsp+gprsize+16*51, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end6:
    SAVE_8ROWS dstq+64*16, 64
    LOAD_8ROWS rsp+gprsize+16*59, 16
    mova      [rsp+gprsize+16*0], m7
    mova      m7, [o(pw_8192)]
    lea       tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)]
    jmp       m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end7:
    SAVE_8ROWS dstq+64*24, 64

    add       coeffq, 16
    add       dstq, 16
    dec       r3d
    jg        .pass1_loop

; Pass 2, first run: the scratch area is the coefficient source and output
; starts at dst+32. The 16x64 worker's .pass2_loop does the work; r4 carries
; the address of .pass2_end (presumably the label it continues at -- confirm
; in idct_16x64). The slot that held the scratch pointer is reused for dst+8.
.pass2:
    mov       dstq, [rsp+gprsize*3+16*67]   ; real dst saved in the prologue
    mov       coeffq, [rsp+gprsize*2+16*67] ; scratch area pointer saved in the prologue
    lea       dstq, [dstq+32]
    mov       r3d, 4
    lea       r4, [dstq+8]
    mov       [rsp+gprsize*2+16*67], r4
    lea       r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
    jmp       m(idct_16x64_internal_8bpc).pass2_loop

.pass2_end:
    LOAD_8ROWS rsp+gprsize+16*35, 16
    lea       dstq, [dstq+strideq*2]
    add       rsp, 16*32                    ; shift rsp for the shared tail; undone in .pass2_end1
    mova      [rsp+gprsize+16*0], m7
    lea       r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)]
    jmp       m(idct_8x32_internal_8bpc).end2

.pass2_end1:
    add       coeffq, 16*32
    sub       rsp, 16*32                    ; restore rsp

    mov       dstq, [rsp+gprsize*2+16*67]   ; next output position
    mov       r3d, [rsp+gprsize*3+16*67]    ; NOTE(review): this slot held dst in the prologue; presumably
                                            ; the 16x64 .pass2_loop stores its counter here -- confirm
    lea       r4, [dstq+8]
    mov       [rsp+gprsize*2+16*67], r4     ; position for the following iteration
    lea       r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]

    dec       r3d
    jg        m(idct_16x64_internal_8bpc).pass2_loop

; Pass 2, second run: the original coefficient buffer is the source and r4 is
; the 16x64 worker's own .end1.
.pass2_end2:
    mov       coeffq, [rsp+gprsize*4+16*67] ; coeff base saved in the prologue
    mov       dstq, [rsp+gprsize*2+16*67]
    mov       r3d, 4
    sub       dstq, 72                      ; presumably rewinds to the real dst: the slot started at
                                            ; dst+32+8 and gains 8 per .pass2_end1 visit (4 visits assumed)
    lea       r4, [dstq+8]
    mov       [rsp+gprsize*2+16*67], r4
    lea       r4, [o(m(idct_16x64_internal_8bpc).end1)]
    jmp       m(idct_16x64_internal_8bpc).pass2_loop