1; Copyright © 2020, VideoLAN and dav1d authors 2; Copyright © 2020, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 64 32int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 33 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 34 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 35 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 36int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 37 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 38 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 39 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 40int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 41 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 42 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 43 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 44dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 45 db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 46 db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 47 db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 48idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 49 db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 50 db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 51 db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 52idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 53 db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 54 db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 55 db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 56idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 57 db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 58 db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 59 db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 60end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60 61 db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 62 db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 63 db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 64 65; packed 4-bit qword shuffle indices 66permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 67 dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 68 dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb 69 dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea 70permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 71 dq 0xc824352d56128751, 0xd906171e74301e15 72 dq 0x6271604b03472d62, 0x735342782165b426 73 dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 74permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 75 dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 76 dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e 77 dq 0x5115049dd9045b79, 0x733726bffb263d1f 78permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 79 dq 0x0f11fa9c01150415, 0x0988f326039d2637 80 dq 0x05640f1108269d8c, 0x05290edb0aaebfae 81 dq 0x0005000509378c9d, 0xffffffff0bbfaebf 82 83pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 84gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 85gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 86gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 87gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 88 89int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 90int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 91int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 92int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 93deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 94int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 95 96pb_32: times 4 db 32 97pw_2048: times 2 dw 2048 98pw_4096: times 2 dw 4096 99pw_8192: times 2 dw 8192 100pw_16384: times 2 dw 16384 101pw_1697x16: times 2 dw 1697*16 102pw_1697x8: times 2 dw 1697*8 103pw_2896x8: times 2 dw 2896*8 104pd_2048: dd 2048 105 106%define pw_5 (permD+52) 107%define pd_m1 (permD+60) 108%define pw_3803_1321 (permD+44) 109%define pw_2482_3803 (permD+12) 110%define pw_2440_3290 (permD+ 4) 111%define pw_m3290_2440 (permD+28) 112%define pw_3857_1380 (permD+36) 113%define pw_m1380_3857 (permD+20) 114 115pw_8192_m8192: dw 8192, -8192 116pw_m8192_8192: dw -8192, 8192 117pw_16384_m16384: dw 16384, -16384 118pw_m16384_16384: dw -16384, 16384 119 120pw_m1321_2482: dw -1321, 2482 121pw_m3344_3344: dw -3344, 3344 122pw_2482_3344: dw 2482, 3344 123pw_m3803_3344: dw -3803, 3344 124pd_3344: dd 3344 125pw_m1321_m3344: dw -1321, -3344 126pw_2896_m2896: dw 2896, -2896 127 128pw_1567_m3784: dw 1567, -3784 129pw_3784_m1567: dw 3784, -1567 130pw_4017_m799: dw 4017, -799 131pw_2276_m3406: dw 2276, -3406 132pw_m799_m4017: dw -799, -4017 133pw_m3406_m2276: dw -3406, -2276 134 135%macro COEF_PAIR 2-3 0 136pw_%1_%2: dw %1, %2 137pw_m%2_%1: dw -%2, %1 138%if %3 139pw_m%1_m%2: dw -%1, -%2 140%endif 141%endmacro 142 143COEF_PAIR 2896, 2896 144COEF_PAIR 1567, 3784, 1 145COEF_PAIR 3784, 1567 146COEF_PAIR 201, 4091 147COEF_PAIR 995, 3973 148COEF_PAIR 1751, 3703 149COEF_PAIR 3035, 2751 150COEF_PAIR 3513, 2106 151COEF_PAIR 4052, 601 152COEF_PAIR 3166, 2598, 1 153COEF_PAIR 3920, 1189, 1 154COEF_PAIR 2276, 3406 155COEF_PAIR 4017, 799 156 157%macro COEF_X8 1-* 158%rep %0 159 dw %1*8, %1*8 160 %rotate 1 161%endrep 162%endmacro 163 164pw_m2276x8: COEF_X8 -2276 165pw_3406x8: COEF_X8 3406 166pw_4017x8: COEF_X8 4017 167pw_799x8: COEF_X8 799 168pw_3784x8: COEF_X8 3784 169pw_1567x8: COEF_X8 1567 170 171pw_4076x8: COEF_X8 4076 172pw_401x8: COEF_X8 401 173pw_m2598x8: COEF_X8 -2598 174pw_3166x8: COEF_X8 3166 175pw_3612x8: COEF_X8 3612 176pw_1931x8: COEF_X8 1931 177pw_m1189x8: COEF_X8 -1189 178pw_3920x8: COEF_X8 3920 179 180pw_4091x8: COEF_X8 4091 181pw_201x8: COEF_X8 201 182pw_m2751x8: COEF_X8 -2751 183pw_3035x8: COEF_X8 3035 184pw_3703x8: COEF_X8 3703 185pw_1751x8: COEF_X8 1751 186pw_m1380x8: COEF_X8 -1380 187pw_3857x8: COEF_X8 3857 188pw_3973x8: COEF_X8 3973 189pw_995x8: COEF_X8 995 190pw_m2106x8: COEF_X8 -2106 191pw_3513x8: COEF_X8 3513 192pw_3290x8: COEF_X8 3290 193pw_2440x8: COEF_X8 2440 194pw_m601x8: COEF_X8 -601 195pw_4052x8: COEF_X8 4052 196 197pw_401_4076x8: dw 401*8, 4076*8 198pw_m2598_3166x8: dw -2598*8, 3166*8 199pw_1931_3612x8: dw 1931*8, 3612*8 200pw_m1189_3920x8: dw -1189*8, 3920*8 201pw_799_4017x8: dw 799*8, 4017*8 202pw_m2276_3406x8: dw -2276*8, 3406*8 203 204pw_201_4091x8: dw 201*8, 4091*8 205pw_m601_4052x8: dw -601*8, 4052*8 206pw_995_3973x8: dw 995*8, 3973*8 207pw_m1380_3857x8: dw -1380*8, 3857*8 208pw_1751_3703x8: dw 1751*8, 3703*8 209pw_m2106_3513x8: dw -2106*8, 3513*8 210pw_2440_3290x8: dw 2440*8, 3290*8 211pw_m2751_3035x8: dw -2751*8, 3035*8 212 213pw_101_4095x8: dw 101*8, 4095*8 214pw_m2824_2967x8: dw -2824*8, 2967*8 215pw_1660_3745x8: dw 1660*8, 3745*8 216pw_m1474_3822x8: dw -1474*8, 3822*8 217pw_897_3996x8: dw 897*8, 3996*8 218pw_m2191_3461x8: dw -2191*8, 3461*8 219pw_2359_3349x8: dw 2359*8, 3349*8 220pw_m700_4036x8: dw -700*8, 4036*8 221pw_501_4065x8: dw 501*8, 4065*8 222pw_m2520_3229x8: dw -2520*8, 3229*8 223pw_2019_3564x8: dw 2019*8, 3564*8 224pw_m1092_3948x8: dw -1092*8, 3948*8 225pw_1285_3889x8: dw 1285*8, 3889*8 226pw_m1842_3659x8: dw -1842*8, 3659*8 227pw_2675_3102x8: dw 2675*8, 3102*8 228pw_m301_4085x8: dw -301*8, 4085*8 229 230idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474 231COEF_PAIR 401, 4076, 1 232COEF_PAIR 799, 4017 233 COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996 234dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017 235 COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092 236COEF_PAIR 1931, 3612, 1 237COEF_PAIR 3406, 2276 238 COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889 239dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276 240 241SECTION .text 242 243%define o_base int8_permA+64*18 244%define o(x) (r5 - (o_base) + (x)) 245%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) 246 247; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, 248; 16 = special_mul1, 32 = special_mul2 249%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags 250 mova m%2, m%4 251%if %7 & 16 252 vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} 253 mova m%3, m%4 254%if %7 & 32 255 vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} 256%else 257 vpdpwssd m%3, m%1, m%6 258%endif 259%elif %7 & 32 260 vpdpwssd m%2, m%1, m%5 261 mova m%3, m%4 262 vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} 263%elif %6 < 32 264 vpdpwssd m%2, m%1, m%5 265 mova m%3, m%4 266 vpdpwssd m%3, m%1, m%6 267%elif %7 & 1 268 vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} 269 mova m%3, m%4 270 vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} 271%else 272 vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} 273 mova m%3, m%4 274 vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} 275%endif 276%if %7 & 2 277 psrld m%2, 12 278 pslld m%3, 4 279 vpshrdd m%1, m%3, m%2, 16 280%elif %7 & 4 281 ; compared to using shifts (as above) this has better throughput, 282 ; but worse latency and requires setting up the opmask/index 283 ; registers, so only use this method for the larger transforms 284 pslld m%1, m%2, 4 285 vpmultishiftqb m%1{k7}, m13, m%3 286%else 287 psrad m%2, 12 288 psrad m%3, 12 289%if %7 & 8 == 0 290 packssdw m%1, m%3, m%2 291%endif 292%endif 293%endmacro 294 295; flags: same as ITX_MUL2X_PACK 296%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags 297%if %11 & 1 298 vpbroadcastd m%4, [o(pw_%9_%10)] 299 vpbroadcastd m%4{k1}, [o(pw_%7_%8)] 300 vpbroadcastd m%5, [o(pw_m%10_%9)] 301 vpbroadcastd m%5{k1}, [o(pw_m%8_%7)] 302%else 303 vpbroadcastd m%4, [o(pw_m%10_%9)] 304 vpbroadcastd m%4{k1}, [o(pw_m%8_%7)] 305 vpbroadcastd m%5, [o(pw_%9_%10)] 306 vpbroadcastd m%5{k1}, [o(pw_%7_%8)] 307%endif 308 ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11 309%endmacro 310 311; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 312; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 313%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 314 punpcklwd m%3, m%2, m%1 315 punpckhwd m%2, m%1 316%if %7 < 32 317 mova m%1, m%5 318 vpdpwssd m%1, m%3, m%7 319 mova m%4, m%5 320 vpdpwssd m%4, m%2, m%7 321%else 322 mova m%1, m%5 323 vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} 324 mova m%4, m%5 325 vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} 326%endif 327 psrad m%1, 12 328 psrad m%4, 12 329 packssdw m%1, m%4 330 mova m%4, m%5 331%if %7 < 32 332 vpdpwssd m%4, m%2, m%6 333 mova m%2, m%5 334 vpdpwssd m%2, m%3, m%6 335%else 336 vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} 337 mova m%2, m%5 338 vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} 339%endif 340 psrad m%4, 12 341 psrad m%2, 12 342%if %0 == 8 343 packssdw m%8, m%2, m%4 344%else 345 packssdw m%2, m%4 346%endif 347%endmacro 348 349%macro WRAP_XMM 1+ 350 %xdefine %%reset RESET_MM_PERMUTATION 351 INIT_XMM cpuname 352 DEFINE_MMREGS xmm 353 AVX512_MM_PERMUTATION 354 %1 355 %%reset 356%endmacro 357 358%macro WRAP_YMM 1+ 359 INIT_YMM cpuname 360 %1 361 INIT_ZMM cpuname 362%endmacro 363 364%macro ITX4_END 4-5 2048 ; row[1-4], rnd 365%if %5 366 vpbroadcastd m2, [o(pw_%5)] 367 pmulhrsw m0, m2 368 pmulhrsw m1, m2 369%endif 370 lea r2, [dstq+strideq*2] 371%assign %%i 1 372%rep 4 373 %if %1 & 2 374 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) 375 %else 376 CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) 377 %endif 378 %assign %%i %%i + 1 379 %rotate 1 380%endrep 381 movd m2, [%%row_adr1] 382 pinsrd m2, [%%row_adr2], 1 383 movd m3, [%%row_adr3] 384 pinsrd m3, [%%row_adr4], 1 385 pmovzxbw m2, m2 386 pmovzxbw m3, m3 387 paddw m0, m2 388 paddw m1, m3 389 packuswb m0, m1 390 movd [%%row_adr1], m0 391 pextrd [%%row_adr2], m0, 1 392 pextrd [%%row_adr3], m0, 2 393 pextrd [%%row_adr4], m0, 3 394 ret 395%endmacro 396 397%macro INV_TXFM_FN 3 ; type1, type2, size 398cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base 399 %define %%p1 m(i%1_%3_internal_8bpc) 400 lea baseq, [o_base] 401 ; Jump to the 1st txfm function if we're not taking the fast path, which 402 ; in turn performs an indirect jump to the 2nd txfm function. 403 lea tx2q, [m(i%2_%3_internal_8bpc).pass2] 404%ifidn %1_%2, dct_dct 405 test eobd, eobd 406 jnz %%p1 407%else 408 ; jump to the 1st txfm function unless it's located directly after this 409 times ((%%end - %%p1) >> 31) & 1 jmp %%p1 410ALIGN function_align 411%%end: 412%endif 413%endmacro 414 415%macro INV_TXFM_4X4_FN 2 ; type1, type2 416 INV_TXFM_FN %1, %2, 4x4 417%ifidn %1_%2, dct_dct 418 vpbroadcastw m0, [cq] 419 vpbroadcastd m1, [o(pw_2896x8)] 420 pmulhrsw m0, m1 421 mov [cq], eobd 422 pmulhrsw m0, m1 423 mova m1, m0 424 jmp m(iadst_4x4_internal_8bpc).end2 425%endif 426%endmacro 427 428%macro IDCT4_1D_PACKED 0 429 vpbroadcastd m4, [o(pd_2048)] 430 punpckhwd m2, m1, m0 431 punpcklwd m1, m0 432 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 433 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 434 paddsw m0, m1, m2 ; out0 out1 435 psubsw m1, m2 ; out3 out2 436%endmacro 437 438%macro IADST4_1D_PACKED 0 439 punpcklwd m4, m1, m0 ; in2 in0 440 punpckhwd m5, m1, m0 ; in3 in1 441.main2: 442 vpbroadcastd m3, [o(pd_2048)] 443 mova m0, m3 444 vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} 445 mova m2, m3 446 vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} 447 mova m1, m3 448 vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} 449 vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} 450 vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} 451 vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} 452 vpdpwssd m1, m5, [o(pd_3344)] {bcstd} 453 vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} 454 REPX {psrad x, 12}, m0, m2, m1, m3 455 packssdw m0, m2 ; out0 out1 456 packssdw m1, m3 ; out2 out3 457%endmacro 458 459INIT_XMM avx512icl 460INV_TXFM_4X4_FN dct, dct 461INV_TXFM_4X4_FN dct, adst 462INV_TXFM_4X4_FN dct, flipadst 463INV_TXFM_4X4_FN dct, identity 464 465cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 466 mova m0, [cq+16*0] 467 mova m1, [cq+16*1] 468 IDCT4_1D_PACKED 469 mova m2, [o(deint_shuf)] 470 shufps m3, m0, m1, q1331 471 shufps m0, m0, m1, q0220 472 pshufb m0, m2 473 pshufb m1, m3, m2 474 jmp tx2q 475.pass2: 476 IDCT4_1D_PACKED 477 pxor ymm16, ymm16 478 mova [cq], ymm16 479 ITX4_END 0, 1, 3, 2 480 481INV_TXFM_4X4_FN adst, dct 482INV_TXFM_4X4_FN adst, adst 483INV_TXFM_4X4_FN adst, flipadst 484INV_TXFM_4X4_FN adst, identity 485 486cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 487 mova m0, [cq+16*0] 488 mova m1, [cq+16*1] 489 call .main 490 punpckhwd m3, m0, m1 491 punpcklwd m0, m1 492 punpckhwd m1, m0, m3 493 punpcklwd m0, m3 494 jmp tx2q 495.pass2: 496 call .main 497.end: 498 pxor ymm16, ymm16 499 mova [cq], ymm16 500.end2: 501 ITX4_END 0, 1, 2, 3 502ALIGN function_align 503.main: 504 IADST4_1D_PACKED 505 ret 506 507INV_TXFM_4X4_FN flipadst, dct 508INV_TXFM_4X4_FN flipadst, adst 509INV_TXFM_4X4_FN flipadst, flipadst 510INV_TXFM_4X4_FN flipadst, identity 511 512cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 513 mova m0, [cq+16*0] 514 mova m1, [cq+16*1] 515 call m(iadst_4x4_internal_8bpc).main 516 punpcklwd m2, m1, m0 517 punpckhwd m1, m0 518 punpcklwd m0, m1, m2 519 punpckhwd m1, m2 520 jmp tx2q 521.pass2: 522 call m(iadst_4x4_internal_8bpc).main 523.end: 524 pxor ymm16, ymm16 525 mova [cq], ymm16 526.end2: 527 ITX4_END 3, 2, 1, 0 528 529INV_TXFM_4X4_FN identity, dct 530INV_TXFM_4X4_FN identity, adst 531INV_TXFM_4X4_FN identity, flipadst 532INV_TXFM_4X4_FN identity, identity 533 534cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 535 mova m0, [cq+16*0] 536 mova m1, [cq+16*1] 537 vpbroadcastd m3, [o(pw_1697x8)] 538 pmulhrsw m2, m3, m0 539 pmulhrsw m3, m1 540 paddsw m0, m2 541 paddsw m1, m3 542 punpckhwd m2, m0, m1 543 punpcklwd m0, m1 544 punpckhwd m1, m0, m2 545 punpcklwd m0, m2 546 jmp tx2q 547.pass2: 548 vpbroadcastd m3, [o(pw_1697x8)] 549 pmulhrsw m2, m3, m0 550 pmulhrsw m3, m1 551 paddsw m0, m2 552 paddsw m1, m3 553 jmp m(iadst_4x4_internal_8bpc).end 554 555%macro INV_TXFM_4X8_FN 2 ; type1, type2 556 INV_TXFM_FN %1, %2, 4x8 557%ifidn %1_%2, dct_dct 558 movd xmm1, [o(pw_2896x8)] 559 pmulhrsw xmm0, xmm1, [cq] 560 movd xmm2, [o(pw_2048)] 561 pmulhrsw xmm0, xmm1 562 pmulhrsw xmm0, xmm1 563 pmulhrsw xmm0, xmm2 564 vpbroadcastw ym0, xmm0 565 mova ym1, ym0 566 jmp m(iadst_4x8_internal_8bpc).end3 567%endif 568%endmacro 569 570%macro IDCT8_1D_PACKED 0 571 punpckhwd m5, m3, m0 ; in7 in1 572 punpckhwd m4, m1, m2 ; in3 in5 573 punpcklwd m3, m1 ; in6 in2 574 punpcklwd m2, m0 ; in4 in0 575.main2: 576 vpbroadcastd m6, [o(pd_2048)] 577 ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a 578 ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a 579 ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 580 psubsw m0, m5, m4 ; t5a t6a (interleaved) 581 paddsw m4, m5 ; t4 t7 (interleaved) 582 ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 583 ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 584%if mmsize > 16 585 vbroadcasti32x4 m1, [o(deint_shuf)] 586 pshufb m4, m1 587%else 588 pshufb m4, [o(deint_shuf)] 589%endif 590 psubsw m1, m2, m3 ; tmp3 tmp2 591 paddsw m3, m2 ; tmp0 tmp1 592 punpckhqdq m2, m4, m0 ; t7 t6 593 punpcklqdq m4, m0 ; t4 t5 594 paddsw m0, m3, m2 ; out0 out1 595 psubsw m3, m2 ; out7 out6 596 psubsw m2, m1, m4 ; out4 out5 597 paddsw m1, m4 ; out3 out2 598%endmacro 599 600%macro IADST8_1D_PACKED 1 ; pass 601 vpbroadcastd m6, [o(pd_2048)] 602%if %1 == 1 603 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a 604 ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a 605 ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a 606 ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a 607 psubsw m4, m0, m2 ; t5 t4 608 paddsw m0, m2 ; t1 t0 609 psubsw m5, m1, m3 ; t6 t7 610 paddsw m1, m3 ; t2 t3 611 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a 612 ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a 613%if mmsize > 16 614 vbroadcasti32x4 m2, [o(deint_shuf)] 615%else 616 mova m2, [o(deint_shuf)] 617%endif 618 vprord m1, 16 619 psubsw m3, m0, m1 ; t3 t2 620 paddsw m0, m1 ; -out7 out0 621 psubsw m1, m4, m5 ; t7 t6 622 paddsw m4, m5 ; out6 -out1 623 pshufb m0, m2 624 pshufb m4, m2 625 mova m2, m6 626 vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} 627 mova m5, m6 628 vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} 629 psrad m2, 12 630 psrad m5, 12 631 packssdw m2, m5 ; out4 -out5 632 mova m5, m6 633 vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} 634 mova m3, m6 635 vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} 636 psrad m5, 12 637 psrad m3, 12 638 packssdw m1, m3, m5 ; out2 -out3 639%else 640 punpckhwd m0, m4, m3 ; 0 7 641 punpckhwd m1, m5, m2 ; 2 5 642 punpcklwd m2, m5 ; 4 3 643 punpcklwd m3, m4 ; 6 1 644 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a 645 ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a 646 ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a 647 ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a 648 psubsw m4, m0, m2 ; t4 t5 649 paddsw m0, m2 ; t0 t1 650 psubsw m5, m1, m3 ; t6 t7 651 paddsw m1, m3 ; t2 t3 652 shufps m2, m5, m4, q1032 653 punpckhwd m4, m2 654 punpcklwd m5, m2 655 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a 656 ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a 657 psubsw m2, m0, m1 ; t2 t3 658 paddsw m0, m1 ; out0 -out7 659 psubsw m1, m4, m5 ; t6 t7 660 paddsw m4, m5 ; -out1 out6 661 vpbroadcastd m5, [o(pw_2896x8)] 662 punpckhqdq m3, m2, m1 ; t3 t7 663 punpcklqdq m2, m1 ; t2 t6 664 paddsw m1, m2, m3 ; t2+t3 t6+t7 665 psubsw m2, m3 ; t2-t3 t6-t7 666 punpckhqdq m3, m4, m0 ; out6 -out7 667 punpcklqdq m0, m4 ; out0 -out1 668 pmulhrsw m2, m5 ; out4 -out5 669 pshufd m1, m1, q1032 670 pmulhrsw m1, m5 ; out2 -out3 671%endif 672%endmacro 673 674INIT_YMM avx512icl 675INV_TXFM_4X8_FN dct, dct 676INV_TXFM_4X8_FN dct, identity 677INV_TXFM_4X8_FN dct, adst 678INV_TXFM_4X8_FN dct, flipadst 679 680cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 681 vpermq m0, [cq+32*0], q3120 682 vpermq m1, [cq+32*1], q3120 683 vpbroadcastd m2, [o(pw_2896x8)] 684 pmulhrsw m0, m2 685 pmulhrsw m1, m2 686 IDCT4_1D_PACKED 687 vbroadcasti32x4 m2, [o(deint_shuf)] 688 shufps m3, m0, m1, q1331 689 shufps m0, m0, m1, q0220 690 pshufb m0, m2 691 pshufb m1, m3, m2 692 jmp tx2q 693.pass2: 694 vextracti32x4 xm2, m0, 1 695 vextracti32x4 xm3, m1, 1 696 call .main 697 vpbroadcastd m4, [o(pw_2048)] 698 vinserti32x4 m0, m0, xm2, 1 699 vinserti32x4 m1, m1, xm3, 1 700 pshufd m1, m1, q1032 701 jmp m(iadst_4x8_internal_8bpc).end2 702ALIGN function_align 703.main: 704 WRAP_XMM IDCT8_1D_PACKED 705 ret 706 707INV_TXFM_4X8_FN adst, dct 708INV_TXFM_4X8_FN adst, adst 709INV_TXFM_4X8_FN adst, flipadst 710INV_TXFM_4X8_FN adst, identity 711 712cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 713 vpermq m0, [cq+32*0], q3120 714 vpermq m1, [cq+32*1], q3120 715 vpbroadcastd m2, [o(pw_2896x8)] 716 pmulhrsw m0, m2 717 pmulhrsw m1, m2 718 call m(iadst_8x4_internal_8bpc).main 719 punpckhwd m3, m0, m1 720 punpcklwd m0, m1 721 punpckhwd m1, m0, m3 722 punpcklwd m0, m3 723 jmp tx2q 724.pass2: 725 vextracti32x4 xm2, m0, 1 726 vextracti32x4 xm3, m1, 1 727 pshufd xm4, xm0, q1032 728 pshufd xm5, xm1, q1032 729 call .main_pass2 730 vpbroadcastd m4, [o(pw_2048)] 731 vinserti32x4 m0, xm2, 1 732 vinserti32x4 m1, xm3, 1 733 pxor m5, m5 734 psubw m5, m4 735.end: 736 punpcklqdq m4, m5 737.end2: 738 pmulhrsw m0, m4 739 pmulhrsw m1, m4 740.end3: 741 vpbroadcastd m3, strided 742 pmulld m5, m3, [o(pd_0to15)] 743 kxnorb k1, k1, k1 744 kmovb k2, k1 745 vpgatherdd m3{k1}, [dstq+m5] 746 pxor m4, m4 747 mova [cq], zmm20 748 punpcklbw m2, m3, m4 749 punpckhbw m3, m4 750 paddw m0, m2 751 paddw m1, m3 752 packuswb m0, m1 753 vpscatterdd [dstq+m5]{k2}, m0 754 RET 755ALIGN function_align 756.main_pass1: 757 punpckhwd xm0, xm4, xm3 ; 0 7 758 punpckhwd xm1, xm5, xm2 ; 2 5 759 punpcklwd xm2, xm5 ; 4 3 760 punpcklwd xm3, xm4 ; 6 1 761 WRAP_XMM IADST8_1D_PACKED 1 762 punpcklqdq xm3, xm4, xm0 ; out6 -out7 763 punpckhqdq xm0, xm4 ; out0 -out1 764 ret 765ALIGN function_align 766.main_pass2: 767 WRAP_XMM IADST8_1D_PACKED 2 768 ret 769 770INV_TXFM_4X8_FN flipadst, dct 771INV_TXFM_4X8_FN flipadst, adst 772INV_TXFM_4X8_FN flipadst, flipadst 773INV_TXFM_4X8_FN flipadst, identity 774 775cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 776 vpermq m0, [cq+32*0], q3120 777 vpermq m1, [cq+32*1], q3120 778 vpbroadcastd m2, [o(pw_2896x8)] 779 pmulhrsw m0, m2 780 pmulhrsw m1, m2 781 call m(iadst_8x4_internal_8bpc).main 782 punpcklwd m3, m1, m0 783 punpckhwd m1, m0 784 punpcklwd m0, m1, m3 785 punpckhwd m1, m3 786 jmp tx2q 787.pass2: 788 vextracti32x4 xm2, m0, 1 789 vextracti32x4 xm3, m1, 1 790 pshufd xm4, xm0, q1032 791 pshufd xm5, xm1, q1032 792 call m(iadst_4x8_internal_8bpc).main_pass2 793 vpbroadcastd m5, [o(pw_2048)] 794 vinserti32x4 m3, xm1, 1 795 vinserti32x4 m2, xm0, 1 796 pxor m4, m4 797 psubw m4, m5 798 pshufd m0, m3, q1032 799 pshufd m1, m2, q1032 800 jmp m(iadst_4x8_internal_8bpc).end 801 802INIT_ZMM avx512icl 803INV_TXFM_4X8_FN identity, dct 804INV_TXFM_4X8_FN identity, adst 805INV_TXFM_4X8_FN identity, flipadst 806INV_TXFM_4X8_FN identity, identity 807 808cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 809 vpbroadcastd m0, [o(pw_2896x8)] 810 pmulhrsw m0, [cq] 811 mova m1, [o(int8_permB)] 812 vpbroadcastd m2, [o(pw_1697x8)] 813 vpermb m0, m1, m0 814 pmulhrsw m2, m0 815 paddsw m0, m2 816 vextracti32x8 ym1, m0, 1 817 jmp tx2q 818.pass2: 819 vpbroadcastd ym4, [o(pw_4096)] 820 jmp m(iadst_4x8_internal_8bpc).end2 821 822%macro INV_TXFM_4X16_FN 2 ; type1, type2 823 INV_TXFM_FN %1, %2, 4x16 824%ifidn %1_%2, dct_dct 825 movsx r6d, word [cq] 826 mov [cq], eobd 827 imul r6d, 181 828 add r6d, 128+256 829 sar r6d, 8+1 830 imul r6d, 181 831 add r6d, 128+2048 832 sar r6d, 8+4 833 vpbroadcastw m0, r6d 834 mova m1, m0 835 jmp m(iadst_4x16_internal_8bpc).end3 836%endif 837%endmacro 838 839%macro IDCT16_1D_PACKED 0 840 punpckhwd m8, m7, m0 ; dct16 in15 in1 841 punpcklwd m9, m4, m0 ; dct4 in2 in0 842 punpckhwd m0, m3, m4 ; dct16 in7 in9 843 punpcklwd m7, m1 ; dct8 in7 in1 844 punpckhwd m1, m6 ; dct16 in3 in13 845 punpcklwd m3, m5 ; dct8 in3 in5 846 punpckhwd m5, m2 ; dct16 in11 in5 847 punpcklwd m6, m2 ; dct4 in3 in1 848.main2: 849 vpbroadcastd m10, [o(pd_2048)] 850.main3: 851 vpbroadcastq m13, [o(int_mshift)] 852 vpcmpub k7, m13, m10, 6 ; 0x33... 853 ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a 854 ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a 855 ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a 856 ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a 857 ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a 858 ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a 859.main4: 860 psubsw m2, m8, m0 ; t9 t14 861 paddsw m8, m0 ; t8 t15 862 psubsw m4, m1, m5 ; t10 t13 863 paddsw m1, m5 ; t11 t12 864 ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 865 psubsw m0, m8, m1 ; t11a t12a 866 paddsw m8, m1 ; t8a t15a 867 psubsw m1, m7, m3 ; t5a t6a 868 paddsw m7, m3 ; t4 t7 869.main5: 870 ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a 871 ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a 872%if mmsize > 16 873 vbroadcasti32x4 m5, [o(deint_shuf)] 874%else 875 mova m5, [o(deint_shuf)] 876%endif 877 vpbroadcastd m11, [o(pw_m2896_2896)] 878 vpbroadcastd m12, [o(pw_2896_2896)] 879 paddsw m3, m2, m4 ; t9 t14 880 psubsw m2, m4 ; t10 t13 881 pshufb m8, m5 882 pshufb m7, m5 883 pshufb m3, m5 884 ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 885 ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 886 ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 887 ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a 888 punpckhqdq m2, m7, m1 ; t7 t6 889 punpcklqdq m7, m1 ; t4 t5 890 psubsw m1, m9, m6 ; dct4 out3 out2 891 paddsw m9, m6 ; dct4 out0 out1 892 packssdw m5, m11 ; t12 t13a 893 packssdw m4, m0 ; t11 t10a 894 punpckhqdq m0, m8, m3 ; t15a t14 895 punpcklqdq m8, m3 ; t8a t9 896 psubsw m3, m9, m2 ; dct8 out7 out6 897 paddsw m9, m2 ; dct8 out0 out1 898 psubsw m2, m1, m7 ; dct8 out4 out5 899 paddsw m1, m7 ; dct8 out3 out2 900 psubsw m7, m9, m0 ; out15 out14 901 paddsw m0, m9 ; out0 out1 902 psubsw m6, m1, m5 ; out12 out13 903 paddsw m1, m5 ; out3 out2 904 psubsw m5, m2, m4 ; out11 out10 905 paddsw m2, m4 ; out4 out5 906 psubsw m4, m3, m8 ; out8 out9 907 paddsw m3, m8 ; out7 out6 908%endmacro 909 910INV_TXFM_4X16_FN dct, dct 911INV_TXFM_4X16_FN dct, identity 912INV_TXFM_4X16_FN dct, adst 913INV_TXFM_4X16_FN dct, flipadst 914 915cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 916 mova ym1, [cq+32*2] 917 vinserti32x8 m1, [cq+32*0], 1 918 mova m0, [o(int16_perm)] 919 mova ym2, [cq+32*3] 920 vinserti32x8 m2, [cq+32*1], 1 921 vpbroadcastd m4, [o(pd_2048)] 922 vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3 923 vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 924 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 925 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 926 vpbroadcastd m4, [o(pw_16384)] 927 psubsw m3, m1, m2 928 paddsw m1, m2 ; out0 out1 929 vprord m3, 16 ; out2 out3 930 punpckldq m0, m1, m3 931 punpckhdq m1, m3 932 pmulhrsw m0, m4 933 pmulhrsw m1, m4 934 jmp tx2q 935.pass2: 936 vextracti32x4 xm2, ym0, 1 937 vextracti32x4 xm3, ym1, 1 938 vextracti32x4 xm4, m0, 2 939 vextracti32x4 xm5, m1, 2 940 vextracti32x4 xm6, m0, 3 941 vextracti32x4 xm7, m1, 3 942 call .main 943 vinserti32x4 ym0, xm2, 1 944 vinserti32x4 ym1, xm3, 1 945 vinserti32x4 ym4, xm6, 1 946 vinserti32x4 ym5, xm7, 1 947 vinserti32x8 m0, ym4, 1 948 vinserti32x8 m1, ym5, 1 949 vpbroadcastd m5, [o(pw_2048)] 950 pshufd m1, m1, q1032 951 jmp m(iadst_4x16_internal_8bpc).end2 952ALIGN function_align 953.main: 954 WRAP_XMM IDCT16_1D_PACKED 955 ret 956 957INV_TXFM_4X16_FN adst, dct 958INV_TXFM_4X16_FN adst, adst 959INV_TXFM_4X16_FN adst, flipadst 960INV_TXFM_4X16_FN adst, identity 961 962cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 963 mova m1, [o(permB)] 964 vpermq m0, m1, [cq+64*0] 965 vpermq m1, m1, [cq+64*1] 966 call m(iadst_16x4_internal_8bpc).main 967 vpbroadcastd m3, [o(pw_16384)] 968 punpckhwd m2, m0, m1 969 punpcklwd m0, m1 970 pmulhrsw m2, m3 971 pmulhrsw m0, m3 972 punpckhwd m1, m0, m2 973 punpcklwd m0, m2 974 jmp tx2q 975.pass2: 976 call .main 977 vpbroadcastd m5, [o(pw_2048)] 978 psrlq m10, 4 979 psubw m6, m8, m5 980.end: 981 vpbroadcastd m7, [o(pw_2896x8)] 982 paddsw ym1, ym2, ym4 983 psubsw ym2, ym4 984 vinserti32x8 m1, ym2, 1 985 pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 986 psrlq m0, m10, 4 987 vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d 988 vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f 989 punpcklqdq m5, m6 990.end2: 991 pmulhrsw m0, m5 992 pmulhrsw m1, m5 993.end3: 994 vpbroadcastd m3, strided 995 pmulld m5, m3, [o(pd_0to15)] 996 kxnorw k1, k1, k1 997 kmovw k2, k1 998 vpgatherdd m3{k1}, [dstq+m5] 999 pxor m4, m4 1000 mova [cq+64*0], m4 1001 mova [cq+64*1], m4 1002 punpcklbw m2, m3, m4 1003 punpckhbw m3, m4 1004 paddw m0, m2 1005 paddw m1, m3 1006 packuswb m0, m1 1007 vpscatterdd [dstq+m5]{k2}, m0 1008 RET 1009ALIGN function_align 1010.main: 1011 movu m3, [o(permB+1)] 1012 psrlq m10, m3, 4 1013.main2: 1014 vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 1015 vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 1016 vpbroadcastd m9, [o(pd_2048)] 1017 vpbroadcastq ym13, [o(int_mshift)] 1018 kxnorb k1, k1, k1 1019 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 1020 punpcklwd m0, m3 ; in0 in15 in2 in13 1021 kshiftrb k1, k1, 4 1022 vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 1023 vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 1024INIT_YMM avx512icl 1025 vpcmpub k7, m13, m9, 6 ; 0x33... 1026 pxor m8, m8 1027 ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 1028 ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 1029 ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 1030 ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 1031 psubsw m2, m0, m3 ; t9a t8a t11a t10a 1032 paddsw m0, m3 ; t1a t0a t3a t2a 1033 psubsw m3, m1, m4 ; t13a t12a t15a t14a 1034 paddsw m4, m1 ; t5a t4a t7a t6a 1035 ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 1036 psubw m7, m8, m7 1037 ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 1038 vpbroadcastd m6, [o(pw_3784_m1567)] 1039 vpbroadcastd m6{k1}, [o(pw_m3784_1567)] 1040 psubsw m1, m0, m4 ; t5 t4 t7 t6 1041 paddsw m0, m4 ; t1 t0 t3 t2 1042 psubsw m4, m2, m3 ; t13a t12a t15a t14a 1043 paddsw m2, m3 ; t9a t8a t11a t10a 1044 ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a 1045 ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 1046 vbroadcasti32x4 m5, [o(deint_shuf)] 1047 pshufb m0, m5 1048 pshufb m2, m5 1049 vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a 1050 vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a 1051 vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 1052 vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 1053 pshufd m2, m2, q1032 ; t6a t7a t14 t15 1054 psubsw m4, m0, m3 ; t3a t2a t11 t10 1055 paddsw m0, m3 ; -out15 out0 out14 -out1 1056 paddsw m3, m1, m2 ; out12 -out3 -out13 out2 1057 psubsw m1, m2 ; t7 t6 t15a t14a 1058 punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a 1059 punpcklqdq m4, m1 ; t3a t7 t11 t15a 1060INIT_ZMM avx512icl 1061 vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 1062 ret 1063 1064INV_TXFM_4X16_FN flipadst, dct 1065INV_TXFM_4X16_FN flipadst, adst 1066INV_TXFM_4X16_FN flipadst, flipadst 1067INV_TXFM_4X16_FN flipadst, identity 1068 1069cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1070 mova m1, [o(permB)] 1071 vpermq m0, m1, [cq+64*0] 1072 vpermq m1, m1, [cq+64*1] 1073 call m(iadst_16x4_internal_8bpc).main 1074 vpbroadcastd m3, [o(pw_16384)] 1075 punpcklwd m2, m1, m0 1076 punpckhwd m1, m0 1077 pmulhrsw m2, m3 1078 pmulhrsw m1, m3 1079 punpcklwd m0, m1, m2 1080 punpckhwd m1, m2 1081 jmp tx2q 1082.pass2: 1083 call m(iadst_4x16_internal_8bpc).main 1084 vpbroadcastd m6, [o(pw_2048)] 1085 psrlq m10, 12 1086 psubw m5, m8, m6 1087 jmp m(iadst_4x16_internal_8bpc).end 1088 1089INV_TXFM_4X16_FN identity, dct 1090INV_TXFM_4X16_FN identity, adst 1091INV_TXFM_4X16_FN identity, flipadst 1092INV_TXFM_4X16_FN identity, identity 1093 1094cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1095 mova m2, [o(int16_perm)] 1096 vpermb m1, m2, [cq+64*0] 1097 vpermb m2, m2, [cq+64*1] 1098 vpbroadcastd m4, [o(pw_1697x8)] 1099 vpbroadcastd m0, [o(pd_m1)] 1100 pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is 1101 vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal 1102 pmulhrsw m4, m2 ; it still works, but if the input is -1 the 1103 vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes 1104 vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless 1105 vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. 1106 punpckldq m0, m1, m2 1107 punpckhdq m1, m2 1108 jmp tx2q 1109.pass2: 1110 vpbroadcastd m3, [o(pw_1697x16)] 1111 vpbroadcastd m5, [o(pw_2048)] 1112 pmulhrsw m2, m3, m0 1113 pmulhrsw m3, m1 1114 paddsw m0, m0 1115 paddsw m1, m1 1116 paddsw m0, m2 1117 paddsw m1, m3 1118 jmp m(iadst_4x16_internal_8bpc).end2 1119 1120%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] 1121 movq xm%3, [dstq ] 1122 movhps xm%3, [dstq+%5] 1123 movq xm%4, [dstq+%6] 1124 movhps xm%4, [dstq+%7] 1125 pmovzxbw m%3, xm%3 1126 pmovzxbw m%4, xm%4 1127%ifnum %1 1128 paddw m%3, m%1 1129%else 1130 paddw m%3, %1 1131%endif 1132%ifnum %2 1133 paddw m%4, m%2 1134%else 1135 paddw m%4, %2 1136%endif 1137 packuswb m%3, m%4 1138 vextracti32x4 xm%4, m%3, 1 1139 movq [dstq ], xm%3 1140 movhps [dstq+%6], xm%3 1141 movq [dstq+%5], xm%4 1142 movhps [dstq+%7], xm%4 1143%endmacro 1144 1145%macro INV_TXFM_8X4_FN 2 ; type1, type2 1146 INV_TXFM_FN %1, %2, 8x4 1147%ifidn %1_%2, dct_dct 1148 movd xm1, [o(pw_2896x8)] 1149 pmulhrsw xm0, xm1, [cq] 1150 movd xm2, [o(pw_2048)] 1151 pmulhrsw xm0, xm1 1152 pmulhrsw xm0, xm1 1153 pmulhrsw xm0, xm2 1154 vpbroadcastw m0, xm0 1155 mova m1, m0 1156 jmp m(iadst_8x4_internal_8bpc).end3 1157%endif 1158%endmacro 1159 1160INIT_YMM avx512icl 1161INV_TXFM_8X4_FN dct, dct 1162INV_TXFM_8X4_FN dct, adst 1163INV_TXFM_8X4_FN dct, flipadst 1164INV_TXFM_8X4_FN dct, identity 1165 1166cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1167 vpbroadcastd xm3, [o(pw_2896x8)] 1168 pmulhrsw xm0, xm3, [cq+16*0] 1169 pmulhrsw xm1, xm3, [cq+16*1] 1170 pmulhrsw xm2, xm3, [cq+16*2] 1171 pmulhrsw xm3, [cq+16*3] 1172 call m(idct_4x8_internal_8bpc).main 1173 vbroadcasti32x4 m4, [o(deint_shuf)] 1174 vinserti32x4 m3, m1, xm3, 1 1175 vinserti32x4 m1, m0, xm2, 1 1176 shufps m0, m1, m3, q0220 1177 shufps m1, m3, q1331 1178 pshufb m0, m4 1179 pshufb m1, m4 1180 jmp tx2q 1181.pass2: 1182 IDCT4_1D_PACKED 1183 vpermq m0, m0, q3120 1184 vpermq m1, m1, q2031 1185 jmp m(iadst_8x4_internal_8bpc).end2 1186 1187INV_TXFM_8X4_FN adst, dct 1188INV_TXFM_8X4_FN adst, adst 1189INV_TXFM_8X4_FN adst, flipadst 1190INV_TXFM_8X4_FN adst, identity 1191 1192cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1193 vpbroadcastd xm0, [o(pw_2896x8)] 1194 pshufd xm4, [cq+16*0], q1032 1195 pmulhrsw xm3, xm0, [cq+16*3] 1196 pshufd xm5, [cq+16*1], q1032 1197 pmulhrsw xm2, xm0, [cq+16*2] 1198 pmulhrsw xm4, xm0 1199 pmulhrsw xm5, xm0 1200 call m(iadst_4x8_internal_8bpc).main_pass1 1201 vinserti32x4 m0, xm2, 1 1202 vinserti32x4 m1, xm3, 1 1203 pxor m3, m3 1204 punpckhwd m2, m0, m1 1205 punpcklwd m0, m1 1206 psubsw m3, m2 1207 punpckhwd m1, m0, m3 1208 punpcklwd m0, m3 1209 jmp tx2q 1210.pass2: 1211 call .main 1212.end: 1213 vpermq m0, m0, q3120 1214 vpermq m1, m1, q3120 1215.end2: 1216 vpbroadcastd m2, [o(pw_2048)] 1217 pmulhrsw m0, m2 1218 pmulhrsw m1, m2 1219.end3: 1220 pxor m2, m2 1221 mova [cq], zmm18 1222 lea r6, [strideq*3] 1223 WRITE_8X4 0, 1, 4, 5 1224 RET 1225ALIGN function_align 1226.main: 1227 IADST4_1D_PACKED 1228 ret 1229 1230INV_TXFM_8X4_FN flipadst, dct 1231INV_TXFM_8X4_FN flipadst, adst 1232INV_TXFM_8X4_FN flipadst, flipadst 1233INV_TXFM_8X4_FN flipadst, identity 1234 1235cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1236 vpbroadcastd xm0, [o(pw_2896x8)] 1237 pshufd xm4, [cq+16*0], q1032 1238 pmulhrsw xm3, xm0, [cq+16*3] 1239 pshufd xm5, [cq+16*1], q1032 1240 pmulhrsw xm2, xm0, [cq+16*2] 1241 pmulhrsw xm4, xm0 1242 pmulhrsw xm5, xm0 1243 call m(iadst_4x8_internal_8bpc).main_pass1 1244 vinserti32x4 m3, m3, xm1, 1 1245 vinserti32x4 m2, m2, xm0, 1 1246 punpckhwd m1, m3, m2 1247 punpcklwd m3, m2 1248 pxor m0, m0 1249 psubsw m0, m1 1250 punpckhwd m1, m0, m3 1251 punpcklwd m0, m3 1252 jmp tx2q 1253.pass2: 1254 call m(iadst_8x4_internal_8bpc).main 1255 mova m2, m1 1256 vpermq m1, m0, q2031 1257 vpermq m0, m2, q2031 1258 jmp m(iadst_8x4_internal_8bpc).end2 1259 1260INV_TXFM_8X4_FN identity, dct 1261INV_TXFM_8X4_FN identity, adst 1262INV_TXFM_8X4_FN identity, flipadst 1263INV_TXFM_8X4_FN identity, identity 1264 1265cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1266 mova xm2, [cq+16*0] 1267 mova xm0, [cq+16*1] 1268 vinserti32x4 m2, [cq+16*2], 1 1269 vinserti32x4 m0, [cq+16*3], 1 1270 vpbroadcastd m3, [o(pw_2896x8)] 1271 punpcklwd m1, m2, m0 1272 punpckhwd m2, m0 1273 pmulhrsw m1, m3 1274 pmulhrsw m2, m3 1275 punpcklwd m0, m1, m2 1276 punpckhwd m1, m2 1277 paddsw m0, m0 1278 paddsw m1, m1 1279 jmp tx2q 1280.pass2: 1281 vpbroadcastd m3, [o(pw_1697x8)] 1282 pmulhrsw m2, m3, m0 1283 pmulhrsw m3, m1 1284 paddsw m0, m2 1285 paddsw m1, m3 1286 jmp m(iadst_8x4_internal_8bpc).end 1287 1288%macro INV_TXFM_8X8_FN 2 ; type1, type2 1289 INV_TXFM_FN %1, %2, 8x8 1290%ifidn %1_%2, dct_dct 1291INIT_ZMM avx512icl 1292 movsx r6d, word [cq] 1293 mov [cq], eobd 1294.dconly: 1295 imul r6d, 181 1296 add r6d, 128+256 1297 sar r6d, 8+1 1298.dconly2: 1299 vpbroadcastd ym2, strided 1300 imul r6d, 181 1301 pmulld ym5, ym2, [o(pd_0to15)] 1302 kxnorb k1, k1, k1 1303 add r6d, 128+2048 1304 sar r6d, 8+4 1305 pxor m3, m3 1306 vpbroadcastw m4, r6d 1307.dconly_loop: 1308 kmovb k2, k1 1309 vpgatherdq m2{k1}, [dstq+ym5] 1310 punpcklbw m0, m2, m3 1311 punpckhbw m1, m2, m3 1312 paddw m0, m4 1313 paddw m1, m4 1314 packuswb m0, m1 1315 kmovb k1, k2 1316 vpscatterdq [dstq+ym5]{k2}, m0 1317 lea dstq, [dstq+strideq*8] 1318 sub r3d, 8 1319 jg .dconly_loop 1320 RET 1321INIT_YMM avx512icl 1322%endif 1323%endmacro 1324 1325INV_TXFM_8X8_FN dct, dct 1326INV_TXFM_8X8_FN dct, identity 1327INV_TXFM_8X8_FN dct, adst 1328INV_TXFM_8X8_FN dct, flipadst 1329 1330cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1331 vpermq m0, [cq+32*0], q3120 ; 0 1 1332 vpermq m3, [cq+32*3], q3120 ; 6 7 1333 vpermq m2, [cq+32*2], q3120 ; 4 5 1334 vpermq m1, [cq+32*1], q3120 ; 2 3 1335 call .main 1336 shufps m4, m0, m1, q0220 1337 shufps m5, m0, m1, q1331 1338 shufps m1, m2, m3, q0220 1339 shufps m3, m2, m3, q1331 1340 vbroadcasti32x4 m0, [o(deint_shuf)] 1341 vpbroadcastd m2, [o(pw_16384)] 1342 REPX {pshufb x, m0}, m4, m5, m1, m3 1343 REPX {pmulhrsw x, m2}, m4, m5, m1, m3 1344 vinserti32x4 m0, m4, xm1, 1 1345 vshufi32x4 m2, m4, m1, 0x03 1346 vinserti32x4 m1, m5, xm3, 1 1347 vshufi32x4 m3, m5, m3, 0x03 1348 jmp tx2q 1349.pass2: 1350 call .main 1351 vpbroadcastd m4, [o(pw_2048)] 1352 vpermq m0, m0, q3120 1353 vpermq m1, m1, q2031 1354 vpermq m2, m2, q3120 1355 vpermq m3, m3, q2031 1356 jmp m(iadst_8x8_internal_8bpc).end2 1357ALIGN function_align 1358.main: 1359 IDCT8_1D_PACKED 1360 ret 1361 1362INV_TXFM_8X8_FN adst, dct 1363INV_TXFM_8X8_FN adst, adst 1364INV_TXFM_8X8_FN adst, flipadst 1365INV_TXFM_8X8_FN adst, identity 1366 1367cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1368 vpermq m4, [cq+32*0], q1302 ; 1 0 1369 vpermq m3, [cq+32*3], q3120 ; 6 7 1370 vpermq m5, [cq+32*1], q1302 ; 3 2 1371 vpermq m2, [cq+32*2], q3120 ; 4 5 1372 call .main_pass1 1373 vpbroadcastd m5, [o(pw_16384_m16384)] 1374 punpcklwd m4, m0, m1 1375 punpckhwd m0, m1 1376 punpcklwd m1, m2, m3 1377 punpckhwd m2, m3 1378 punpcklwd m3, m4, m0 1379 punpckhwd m4, m0 1380 punpcklwd m0, m1, m2 1381 punpckhwd m1, m2 1382 REPX {pmulhrsw x, m5}, m3, m4, m0, m1 1383 vshufi32x4 m2, m3, m0, 0x03 1384 vinserti32x4 m0, m3, xm0, 1 1385 vshufi32x4 m3, m4, m1, 0x03 1386 vinserti32x4 m1, m4, xm1, 1 1387 jmp tx2q 1388.pass2: 1389 pshufd m4, m0, q1032 1390 pshufd m5, m1, q1032 1391 call .main_pass2 1392 vpbroadcastd m5, [o(pw_2048)] 1393 vpbroadcastd xm4, [o(pw_4096)] 1394 psubw m4, m5 ; lower half = 2048, upper half = -2048 1395.end: 1396 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 1397.end2: 1398 pmulhrsw m0, m4 1399 pmulhrsw m1, m4 1400.end3: 1401 pmulhrsw m2, m4 1402 pmulhrsw m3, m4 1403.end4: 1404 pxor m4, m4 1405 mova [cq+32*0], m4 1406 mova [cq+32*1], m4 1407 mova [cq+32*2], m4 1408 mova [cq+32*3], m4 1409 lea r6, [strideq*3] 1410 WRITE_8X4 0, 1, 4, 5 1411 lea dstq, [dstq+strideq*4] 1412 WRITE_8X4 2, 3, 4, 5 1413 RET 1414ALIGN function_align 1415.main_pass1: 1416 punpckhwd m0, m4, m3 ; 0 7 1417 punpckhwd m1, m5, m2 ; 2 5 1418 punpcklwd m2, m5 ; 4 3 1419 punpcklwd m3, m4 ; 6 1 1420 IADST8_1D_PACKED 1 1421 punpcklqdq m3, m4, m0 ; out6 -out7 1422 punpckhqdq m0, m4 ; out0 -out1 1423 ret 1424ALIGN function_align 1425.main_pass2: 1426 IADST8_1D_PACKED 2 1427 ret 1428 1429INV_TXFM_8X8_FN flipadst, dct 1430INV_TXFM_8X8_FN flipadst, adst 1431INV_TXFM_8X8_FN flipadst, flipadst 1432INV_TXFM_8X8_FN flipadst, identity 1433 1434cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1435 vpermq m4, [cq+32*0], q1302 ; 1 0 1436 vpermq m3, [cq+32*3], q3120 ; 6 7 1437 vpermq m5, [cq+32*1], q1302 ; 3 2 1438 vpermq m2, [cq+32*2], q3120 ; 4 5 1439 call m(iadst_8x8_internal_8bpc).main_pass1 1440 vpbroadcastd m5, [o(pw_m16384_16384)] 1441 punpckhwd m4, m3, m2 1442 punpcklwd m3, m2 1443 punpckhwd m2, m1, m0 1444 punpcklwd m1, m0 1445 punpckhwd m0, m4, m3 1446 punpcklwd m4, m3 1447 punpckhwd m3, m2, m1 1448 punpcklwd m2, m1 1449 REPX {pmulhrsw x, m5}, m0, m4, m3, m2 1450 vinserti32x4 m1, m0, xm3, 1 1451 vshufi32x4 m3, m0, m3, 0x03 1452 vinserti32x4 m0, m4, xm2, 1 1453 vshufi32x4 m2, m4, m2, 0x03 1454 jmp tx2q 1455.pass2: 1456 pshufd m4, m0, q1032 1457 pshufd m5, m1, q1032 1458 call m(iadst_8x8_internal_8bpc).main_pass2 1459 vpbroadcastd m4, [o(pw_2048)] 1460 vpbroadcastd xm5, [o(pw_4096)] 1461 psubw m4, m5 ; lower half = -2048, upper half = 2048 1462 vpermq m5, m3, q2031 1463 vpermq m3, m0, q2031 1464 vpermq m0, m2, q2031 1465 vpermq m2, m1, q2031 1466 pmulhrsw m1, m0, m4 1467 pmulhrsw m0, m5, m4 1468 jmp m(iadst_8x8_internal_8bpc).end3 1469 1470INV_TXFM_8X8_FN identity, dct 1471INV_TXFM_8X8_FN identity, adst 1472INV_TXFM_8X8_FN identity, flipadst 1473INV_TXFM_8X8_FN identity, identity 1474 1475cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1476 mova xm3, [cq+16*0] 1477 mova xm2, [cq+16*1] 1478 vinserti32x4 m3, [cq+16*4], 1 1479 vinserti32x4 m2, [cq+16*5], 1 1480 mova xm4, [cq+16*2] 1481 mova xm0, [cq+16*3] 1482 vinserti32x4 m4, [cq+16*6], 1 1483 vinserti32x4 m0, [cq+16*7], 1 1484 punpcklwd m1, m3, m2 1485 punpckhwd m3, m2 1486 punpcklwd m2, m4, m0 1487 punpckhwd m4, m0 1488 punpckldq m0, m1, m2 1489 punpckhdq m1, m2 1490 punpckldq m2, m3, m4 1491 punpckhdq m3, m4 1492 jmp tx2q 1493.pass2: 1494 vpbroadcastd m4, [o(pw_4096)] 1495 jmp m(iadst_8x8_internal_8bpc).end 1496 1497%macro INV_TXFM_8X16_FN 2 ; type1, type2 1498 INV_TXFM_FN %1, %2, 8x16 1499%ifidn %1_%2, dct_dct 1500 movsx r6d, word [cq] 1501 mov [cq], eobd 1502 imul r6d, 181 1503 mov r3d, 16 1504 add r6d, 128 1505 sar r6d, 8 1506 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly 1507%endif 1508%endmacro 1509 1510%macro ITX_8X16_LOAD_COEFS 0 1511 vpbroadcastd m4, [o(pw_2896x8)] 1512 pmulhrsw m0, m4, [cq+32*0] 1513 add cq, 32*4 1514 pmulhrsw m7, m4, [cq+32*3] 1515 pmulhrsw m1, m4, [cq-32*3] 1516 pmulhrsw m6, m4, [cq+32*2] 1517 pmulhrsw m2, m4, [cq-32*2] 1518 pmulhrsw m5, m4, [cq+32*1] 1519 pmulhrsw m3, m4, [cq-32*1] 1520 pmulhrsw m4, [cq+32*0] 1521%endmacro 1522 1523INIT_ZMM avx512icl 1524INV_TXFM_8X16_FN dct, dct 1525INV_TXFM_8X16_FN dct, identity 1526INV_TXFM_8X16_FN dct, adst 1527INV_TXFM_8X16_FN dct, flipadst 1528 1529cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1530 mova m3, [o(permB)] 1531 vpermq m0, m3, [cq+64*0] 1532 vpbroadcastd m4, [o(pw_2896x8)] 1533 vpermq m1, m3, [cq+64*1] 1534 vpermq m2, m3, [cq+64*2] 1535 vpermq m3, m3, [cq+64*3] 1536 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 1537 call m(idct_16x8_internal_8bpc).main 1538 vpbroadcastd m5, [o(pw_16384)] 1539 punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3 1540 punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3 1541 punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3 1542 punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3 1543 REPX {pmulhrsw x, m5}, m4, m0, m2, m1 1544 punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3 1545 punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1 1546 punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3 1547 punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1 1548 punpckhdq m1, m0, m2 ; 1 5 9 13 1549 punpckldq m0, m2 ; 0 4 8 12 1550 punpckldq m2, m3, m4 ; 2 6 10 14 1551 punpckhdq m3, m4 ; 3 7 11 15 1552 jmp tx2q 1553.pass2: 1554 vprord m5, [o(int16_perm)], 16 1555 vshufi32x4 m2, m2, q1320 ; 2 10 14 6 1556 vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 1557 vshufi32x4 m1, m3, q0132 ; 9 13 7 3 1558 vpermb m9, m5, m0 1559 vpermb m7, m5, m2 1560 vpermb m8, m5, m4 1561 vpermb m0, m5, m1 1562 vextracti32x8 ym6, m9, 1 1563 vextracti32x8 ym3, m7, 1 1564 vextracti32x8 ym5, m8, 1 1565 vextracti32x8 ym1, m0, 1 1566 call .main2 1567 mova ym8, [o(gather8a)] 1568 lea r3, [dstq+strideq*4] 1569 pmovzxdq m9, ym8 1570 pshufd ym8, ym8, q1230 1571 vpermt2q m0, m9, m4 1572 vpermt2q m1, m9, m5 1573 vpermt2q m2, m9, m6 1574 vpermt2q m3, m9, m7 1575.end: 1576 vpbroadcastd m7, [o(pw_2048)] 1577.end2: 1578 pmulhrsw m0, m7 1579 pmulhrsw m1, m7 1580.end3: 1581 pmulhrsw m2, m7 1582 pmulhrsw m3, m7 1583.end4: 1584 vpbroadcastd ym6, strided 1585 kxnorb k1, k1, k1 1586 pxor m4, m4 1587 pmulld ym8, ym6 1588 kmovb k2, k1 1589 vpgatherdq m6{k1}, [dstq+ym8] 1590 kmovb k1, k2 1591 vpgatherdq m7{k2}, [r3+ym8] 1592 mova [cq+64*0], m4 1593 mova [cq+64*1], m4 1594 kmovb k2, k1 1595 mova [cq+64*2], m4 1596 mova [cq+64*3], m4 1597 punpcklbw m5, m6, m4 1598 punpckhbw m6, m4 1599 paddw m0, m5 1600 paddw m1, m6 1601 packuswb m0, m1 1602 vpscatterdq [dstq+ym8]{k1}, m0 1603 punpcklbw m6, m7, m4 1604 punpckhbw m7, m4 1605 paddw m2, m6 1606 paddw m3, m7 1607 packuswb m2, m3 1608 vpscatterdq [r3+ym8]{k2}, m2 1609 RET 1610ALIGN function_align 1611.main: 1612 WRAP_YMM IDCT16_1D_PACKED 1613 ret 1614 1615INV_TXFM_8X16_FN adst, dct 1616INV_TXFM_8X16_FN adst, adst 1617INV_TXFM_8X16_FN adst, flipadst 1618INV_TXFM_8X16_FN adst, identity 1619 1620cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1621 call m(iadst_16x8_internal_8bpc).main_pass1 1622 vbroadcasti32x4 m6, [o(int_shuf1)] 1623 vpbroadcastd m7, [o(pw_16384_m16384)] 1624 punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 1625 punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 1626 pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 1627 pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 1628.pass1_end: 1629 REPX {pmulhrsw x, m7}, m3, m5, m4, m2 1630 punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1 1631 punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3 1632 punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 1633 punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 1634 punpckhqdq m1, m0, m2 1635 punpcklqdq m0, m2 1636 punpcklqdq m2, m3, m5 1637 punpckhqdq m3, m5 1638 jmp tx2q 1639.pass2: 1640 call .main_pass2 1641 vpbroadcastd m6, [o(pw_2048)] 1642 psrlq m10, 4 1643 psubw m7, m8, m6 1644.pass2_end: 1645 vpbroadcastd m5, [o(pw_2896x8)] 1646 paddsw m1, m2, m4 1647 psubsw m2, m4 1648 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 1649 pmulhrsw m5, m2 ; out8 -out11 -out9 out10 1650 mova ym8, [o(gather8c)] 1651 lea r3, [dstq+strideq] 1652 psrlq m2, m10, 4 1653 vpermi2q m2, m0, m3 ; 1 3 13 15 1654 vpermt2q m0, m10, m3 ; 0 2 12 14 1655 psrlq m3, m10, 8 1656 vpermi2q m3, m1, m5 ; 5 7 9 11 1657 psrlq m10, 12 1658 vpermt2q m1, m10, m5 ; 4 6 8 10 1659 pmulhrsw m0, m6 1660 pmulhrsw m1, m6 1661 jmp m(idct_8x16_internal_8bpc).end3 1662ALIGN function_align 1663.main_pass1: 1664 vpbroadcastd m2, [o(pw_2896x8)] 1665 pmulhrsw m5, m2, [cq+64*0] 1666 pmulhrsw m3, m2, [cq+64*3] 1667 pmulhrsw m1, m2, [cq+64*1] 1668 pmulhrsw m2, [cq+64*2] 1669 movu m4, [o(permA+3)] 1670 psrlq m10, m4, 4 1671 mova m6, m4 1672 vpermi2q m4, m5, m3 ; in0 in12 in2 in14 1673 vpermt2q m5, m10, m3 ; in15 in3 in13 in1 1674 vpermi2q m6, m1, m2 ; in4 in8 in6 in10 1675 vpermt2q m1, m10, m2 ; in11 in7 in9 in5 1676 jmp .main 1677ALIGN function_align 1678.main_pass2: 1679 mova m4, [o(permC)] 1680 psrlq m5, m4, 4 1681 vpermi2q m4, m0, m2 ; in0 in12 in2 in14 1682 psrlq m6, m5, 4 1683 vpermi2q m5, m1, m3 ; in15 in3 in13 in1 1684 psrlq m10, m6, 4 1685 vpermi2q m6, m0, m2 ; in4 in8 in6 in10 1686 vpermt2q m1, m10, m3 ; in11 in7 in9 in5 1687.main: 1688 vpbroadcastd m9, [o(pd_2048)] 1689 vpbroadcastq m13, [o(int_mshift)] 1690 kxnorb k1, k1, k1 1691 punpcklwd m0, m4, m5 ; in0 in15 in2 in13 1692 punpckhwd m4, m5 ; in12 in3 in14 in1 1693 punpcklwd m5, m6, m1 ; in4 in11 in6 in9 1694 punpckhwd m6, m1 ; in8 in7 in10 in5 1695 vpcmpub k7, m13, m9, 6 ; 0x33... 1696 pxor m8, m8 1697 ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 1698 ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5 1699 ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5 1700 ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5 1701 psubsw m2, m0, m6 ; t9a t8a t11a t10a 1702 paddsw m0, m6 ; t1a t0a t3a t2a 1703 psubsw m3, m5, m4 ; t13a t12a t15a t14a 1704 paddsw m5, m4 ; t5a t4a t7a t6a 1705 ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5 1706 psubw m7, m8, m7 1707 ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4 1708 vpbroadcastd m6, [o(pw_3784_m1567)] 1709 vpbroadcastd m6{k1}, [o(pw_m3784_1567)] 1710 psubsw m1, m0, m5 ; t5 t4 t7 t6 1711 paddsw m0, m5 ; t1 t0 t3 t2 1712 psubsw m4, m2, m3 ; t13a t12a t15a t14a 1713 paddsw m2, m3 ; t9a t8a t11a t10a 1714 ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a 1715 ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15 1716 vbroadcasti32x4 m5, [o(deint_shuf)] 1717 pshufb m0, m5 1718 pshufb m2, m5 1719 vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a 1720 vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a 1721 vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15 1722 vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12 1723 pshufd m2, m2, q1032 ; t7a t6a t15 t14 1724 psubsw m4, m0, m3 ; t3a t2a t11 t10 1725 paddsw m0, m3 ; -out15 out0 out14 -out1 1726 paddsw m3, m1, m2 ; out12 -out3 -out13 out2 1727 psubsw m1, m2 ; t7 t6 t15a t14a 1728 punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a 1729 punpcklqdq m4, m1 ; t3a t7 t11 t15a 1730 ret 1731 1732INV_TXFM_8X16_FN flipadst, dct 1733INV_TXFM_8X16_FN flipadst, adst 1734INV_TXFM_8X16_FN flipadst, flipadst 1735INV_TXFM_8X16_FN flipadst, identity 1736 1737cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1738 call m(iadst_16x8_internal_8bpc).main_pass1 1739 vbroadcasti32x4 m6, [o(int_shuf2)] 1740 vpbroadcastd m7, [o(pw_m16384_16384)] 1741 punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 1742 punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 1743 pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 1744 pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 1745 jmp m(iadst_8x16_internal_8bpc).pass1_end 1746.pass2: 1747 call m(iadst_8x16_internal_8bpc).main_pass2 1748 vpbroadcastd m7, [o(pw_2048)] 1749 psrlq m10, 36 1750 psubw m6, m8, m7 1751 jmp m(iadst_8x16_internal_8bpc).pass2_end 1752 1753INV_TXFM_8X16_FN identity, dct 1754INV_TXFM_8X16_FN identity, adst 1755INV_TXFM_8X16_FN identity, flipadst 1756INV_TXFM_8X16_FN identity, identity 1757 1758cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1759 mova m0, [o(int16_perm)] 1760 vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 1761 vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 1762 vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 1763 vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 1764 vpbroadcastd m5, [o(pw_2896x8)] 1765 punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 1766 punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 1767 punpckldq m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1 1768 punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3 1769 REPX {pmulhrsw x, m5}, m1, m2, m3, m4 1770 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 1771 punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1 1772 punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2 1773 punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 1774 jmp tx2q 1775.pass2: 1776 vpbroadcastd m7, [o(pw_1697x16)] 1777 mova ym8, [o(gather8b)] 1778 lea r3, [dstq+strideq*2] 1779 pmulhrsw m4, m7, m0 1780 pmulhrsw m5, m7, m1 1781 pmulhrsw m6, m7, m2 1782 pmulhrsw m7, m3 1783 REPX {paddsw x, x}, m0, m1, m2, m3 1784 paddsw m0, m4 1785 paddsw m1, m5 1786 paddsw m2, m6 1787 paddsw m3, m7 1788 jmp m(idct_8x16_internal_8bpc).end 1789 1790%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] 1791 pmovzxbw m%3, [dstq+%5] 1792%ifnum %1 1793 paddw m%3, m%1 1794%else 1795 paddw m%3, %1 1796%endif 1797 pmovzxbw m%4, [dstq+%6] 1798%ifnum %2 1799 paddw m%4, m%2 1800%else 1801 paddw m%4, %2 1802%endif 1803 packuswb m%3, m%4 1804 vpermq m%3, m%3, q3120 1805 mova [dstq+%5], xm%3 1806 vextracti32x4 [dstq+%6], m%3, 1 1807%endmacro 1808 1809%macro INV_TXFM_16X4_FN 2 ; type1, type2 1810 INV_TXFM_FN %1, %2, 16x4 1811%ifidn %1_%2, dct_dct 1812 movsx r6d, word [cq] 1813 mov [cq], eobd 1814 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 1815%endif 1816%endmacro 1817 1818INIT_ZMM avx512icl 1819INV_TXFM_16X4_FN dct, dct 1820INV_TXFM_16X4_FN dct, adst 1821INV_TXFM_16X4_FN dct, flipadst 1822INV_TXFM_16X4_FN dct, identity 1823 1824cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1825 mova xm0, [cq+16*0] 1826 mova xm1, [cq+16*1] 1827 mova xm2, [cq+16*2] 1828 mova xm3, [cq+16*3] 1829 mova xm4, [cq+16*4] 1830 mova xm5, [cq+16*5] 1831 mova xm6, [cq+16*6] 1832 mova xm7, [cq+16*7] 1833 call m(idct_4x16_internal_8bpc).main 1834 vpbroadcastd m8, [o(pw_16384)] 1835 vinserti32x4 ym1, xm3, 1 ; 3 2 7 6 1836 vinserti32x4 ym5, xm7, 1 ; b a f e 1837 vinserti32x4 ym0, xm2, 1 ; 0 1 4 5 1838 vinserti32x4 ym4, xm6, 1 ; 8 9 c d 1839 vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e 1840 vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d 1841 pmulhrsw m1, m8 1842 pmulhrsw m0, m8 1843 pshufd m1, m1, q1032 1844 punpckhwd m2, m0, m1 1845 punpcklwd m0, m1 1846 punpckhwd m1, m0, m2 1847 punpcklwd m0, m2 1848 jmp tx2q 1849.pass2: 1850 IDCT4_1D_PACKED 1851 mova m2, [o(permA)] 1852 jmp m(iadst_16x4_internal_8bpc).end 1853 1854INV_TXFM_16X4_FN adst, dct 1855INV_TXFM_16X4_FN adst, adst 1856INV_TXFM_16X4_FN adst, flipadst 1857INV_TXFM_16X4_FN adst, identity 1858 1859cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1860 mova m0, [cq+64*0] 1861 mova m1, [cq+64*1] 1862 movshdup m3, [o(permB)] 1863 psrlq m10, m3, 4 1864 call m(iadst_4x16_internal_8bpc).main2 1865 vpbroadcastd m6, [o(pw_16384_m16384)] 1866 psrlq m0, m10, 4 1867 psrlq m10, 8 1868.pass1_end: 1869 punpcklwd ym5, ym4, ym2 1870 punpckhwd ym4, ym2 1871 vinserti32x8 m5, ym4, 1 1872 mova m1, m9 1873 vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} 1874 mova m4, m9 1875 vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16} 1876 psrad m1, 12 1877 psrad m4, 12 1878 packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5 1879 vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d 1880 vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f 1881 punpckhwd m2, m0, m1 1882 punpcklwd m0, m1 1883 punpckhwd m1, m0, m2 1884 punpcklwd m0, m2 1885 pmulhrsw m0, m6 1886 pmulhrsw m1, m6 1887 jmp tx2q 1888.pass2: 1889 call .main 1890 movu m2, [o(permA+1)] 1891.end: 1892 vpbroadcastd m3, [o(pw_2048)] 1893 pmulhrsw m0, m3 1894 pmulhrsw m1, m3 1895.end2: 1896 psrlq m3, m2, 4 1897 vpermi2q m2, m0, m1 1898 vpermi2q m3, m0, m1 1899.end3: 1900 lea r3, [dstq+strideq*2] 1901 mova xm1, [dstq+strideq*0] 1902 vinserti32x4 ym1, [dstq+strideq*1], 1 1903 vinserti32x4 m1, [r3 +strideq*0], 2 1904 vinserti32x4 m1, [r3 +strideq*1], 3 1905 pxor m4, m4 1906 mova [cq+64*0], m4 1907 mova [cq+64*1], m4 1908 punpcklbw m0, m1, m4 1909 punpckhbw m1, m4 1910 paddw m0, m2 1911 paddw m1, m3 1912 packuswb m0, m1 1913 mova [dstq+strideq*0], xm0 1914 vextracti32x4 [dstq+strideq*1], ym0, 1 1915 vextracti32x4 [r3 +strideq*0], m0, 2 1916 vextracti32x4 [r3 +strideq*1], m0, 3 1917 RET 1918ALIGN function_align 1919.main: 1920 IADST4_1D_PACKED 1921 ret 1922 1923INV_TXFM_16X4_FN flipadst, dct 1924INV_TXFM_16X4_FN flipadst, adst 1925INV_TXFM_16X4_FN flipadst, flipadst 1926INV_TXFM_16X4_FN flipadst, identity 1927 1928cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1929 mova m0, [cq+64*0] 1930 mova m1, [cq+64*1] 1931 movshdup m3, [o(permB)] 1932 psrlq m10, m3, 4 1933 call m(iadst_4x16_internal_8bpc).main2 1934 vpbroadcastd m6, [o(pw_m16384_16384)] 1935 psrlq m0, m10, 12 1936 psrlq m10, 16 1937 jmp m(iadst_16x4_internal_8bpc).pass1_end 1938.pass2: 1939 call m(iadst_16x4_internal_8bpc).main 1940 movu m2, [o(permA+2)] 1941 jmp m(iadst_16x4_internal_8bpc).end 1942 1943INV_TXFM_16X4_FN identity, dct 1944INV_TXFM_16X4_FN identity, adst 1945INV_TXFM_16X4_FN identity, flipadst 1946INV_TXFM_16X4_FN identity, identity 1947 1948cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1949 mova m1, [cq+64*0] 1950 mova m2, [cq+64*1] 1951 vpbroadcastd m3, [o(pw_1697x16)] 1952 vpbroadcastd m4, [o(pw_16384)] 1953 mova m5, [o(idtx_16x4p)] 1954 shufps m0, m1, m2, q2020 1955 shufps m1, m2, q3131 1956 pmulhrsw m2, m3, m0 1957 pmulhrsw m3, m1 1958 pmulhrsw m2, m4 1959 pmulhrsw m3, m4 1960 paddsw m0, m2 1961 paddsw m1, m3 1962 vpermb m0, m5, m0 1963 vpermb m1, m5, m1 1964 jmp tx2q 1965.pass2: 1966 vpbroadcastd m3, [o(pw_1697x8)] 1967 pmulhrsw m2, m3, m0 1968 pmulhrsw m3, m1 1969 paddsw m0, m2 1970 paddsw m1, m3 1971 movu m2, [o(permA+1)] 1972 jmp m(iadst_16x4_internal_8bpc).end 1973 1974%macro INV_TXFM_16X8_FN 2 ; type1, type2 1975 INV_TXFM_FN %1, %2, 16x8 1976%ifidn %1_%2, dct_dct 1977 movsx r6d, word [cq] 1978 mov [cq], eobd 1979 mov r3d, 8 1980.dconly: 1981 imul r6d, 181 1982 add r6d, 128 1983 sar r6d, 8 1984.dconly2: 1985 imul r6d, 181 1986 add r6d, 128+256 1987 sar r6d, 8+1 1988.dconly3: 1989 imul r6d, 181 1990 lea r2, [strideq*3] 1991 add r6d, 128+2048 1992 sar r6d, 8+4 1993 pxor m2, m2 1994 vpbroadcastw m3, r6d 1995.dconly_loop: 1996 mova xm1, [dstq+strideq*0] 1997 vinserti32x4 ym1, [dstq+strideq*1], 1 1998 vinserti32x4 m1, [dstq+strideq*2], 2 1999 vinserti32x4 m1, [dstq+r2 ], 3 2000 punpcklbw m0, m1, m2 2001 punpckhbw m1, m2 2002 paddw m0, m3 2003 paddw m1, m3 2004 packuswb m0, m1 2005 mova [dstq+strideq*0], xm0 2006 vextracti32x4 [dstq+strideq*1], ym0, 1 2007 vextracti32x4 [dstq+strideq*2], m0, 2 2008 vextracti32x4 [dstq+r2 ], m0, 3 2009 lea dstq, [dstq+strideq*4] 2010 sub r3d, 4 2011 jg .dconly_loop 2012 RET 2013%endif 2014%endmacro 2015 2016%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd 2017 vpbroadcastd m8, [o(pw_2896x8)] 2018 vpermq m0, [cq+32*0], q3120 2019 add cq, 32*4 2020 vpermq m7, [cq+32*3], q%1 2021 vpermq m1, [cq-32*3], q%1 2022 vpermq m6, [cq+32*2], q3120 2023 vpermq m2, [cq-32*2], q3120 2024 vpermq m5, [cq+32*1], q%1 2025 vpermq m3, [cq-32*1], q%1 2026 vpermq m4, [cq+32*0], q3120 2027 REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 2028%endmacro 2029 2030INV_TXFM_16X8_FN dct, dct 2031INV_TXFM_16X8_FN dct, identity 2032INV_TXFM_16X8_FN dct, adst 2033INV_TXFM_16X8_FN dct, flipadst 2034 2035cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2036 vpbroadcastd m1, [o(pw_2896x8)] 2037 vpermq m0, [cq+64*0], q3120 2038 vpermq m2, [cq+64*1], q3120 2039 vpermq m4, [cq+64*2], q3120 2040 vpermq m6, [cq+64*3], q3120 2041 REPX {pmulhrsw x, m1}, m0, m2, m4, m6 2042 vextracti32x8 ym1, m0, 1 2043 vextracti32x8 ym3, m2, 1 2044 vextracti32x8 ym5, m4, 1 2045 vextracti32x8 ym7, m6, 1 2046 call m(idct_8x16_internal_8bpc).main 2047 vbroadcasti32x4 m8, [o(int_shuf1)] 2048 vbroadcasti32x4 m9, [o(int_shuf2)] 2049 vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3 2050 vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3 2051 vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3 2052 vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3 2053 vpbroadcastd m2, [o(pw_16384)] 2054 pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3 2055 pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3 2056 pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3 2057 pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3 2058 REPX {pmulhrsw x, m2}, m0, m1, m6, m7 2059 punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 2060 punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3 2061 punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1 2062 punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 2063 jmp tx2q 2064.pass2: 2065 vshufi32x4 m0, m2, m4, q2020 ; 0 1 2066 vshufi32x4 m2, m4, q3131 ; 4 5 2067 vshufi32x4 m1, m3, m5, q2020 ; 2 3 2068 vshufi32x4 m3, m5, q3131 ; 6 7 2069 call .main 2070 movshdup m4, [o(permC)] 2071 psrlq m6, m4, 4 2072 vpermq m5, m4, q1032 2073 vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3 2074 vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1 2075 psrlq m6, m5, 4 2076 vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3 2077 vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1 2078 vpbroadcastd m6, [o(pw_2048)] 2079.end: 2080 REPX {pmulhrsw x, m6}, m0, m4, m1, m5 2081.end2: 2082 lea r3, [dstq+strideq*4] 2083 lea r4, [strideq*3] 2084 mova xm3, [dstq+strideq*0] 2085 mova xm6, [dstq+strideq*2] 2086 vinserti32x4 ym3, [dstq+strideq*1], 1 2087 vinserti32x4 ym6, [dstq+r4 ], 1 2088 vinserti32x4 m3, [r3 +strideq*0], 2 2089 vinserti32x4 m6, [r3 +strideq*2], 2 2090 vinserti32x4 m3, [r3 +strideq*1], 3 2091 vinserti32x4 m6, [r3 +r4 ], 3 2092 pxor m7, m7 2093 mova [cq+64*0], m7 2094 mova [cq+64*1], m7 2095 mova [cq+64*2], m7 2096 mova [cq+64*3], m7 2097 punpcklbw m2, m3, m7 2098 punpckhbw m3, m7 2099 paddw m0, m2 2100 paddw m4, m3 2101 packuswb m0, m4 2102 mova [dstq+strideq*0], xm0 2103 vextracti32x4 [dstq+strideq*1], ym0, 1 2104 vextracti32x4 [r3 +strideq*0], m0, 2 2105 vextracti32x4 [r3 +strideq*1], m0, 3 2106 punpcklbw m3, m6, m7 2107 punpckhbw m6, m7 2108 paddw m1, m3 2109 paddw m5, m6 2110 packuswb m1, m5 2111 mova [dstq+strideq*2], xm1 2112 vextracti32x4 [dstq+r4 ], ym1, 1 2113 vextracti32x4 [r3 +strideq*2], m1, 2 2114 vextracti32x4 [r3 +r4 ], m1, 3 2115 RET 2116ALIGN function_align 2117.main: 2118 IDCT8_1D_PACKED 2119 ret 2120 2121INV_TXFM_16X8_FN adst, dct 2122INV_TXFM_16X8_FN adst, adst 2123INV_TXFM_16X8_FN adst, flipadst 2124INV_TXFM_16X8_FN adst, identity 2125 2126cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2127 call m(iadst_8x16_internal_8bpc).main_pass1 2128 vpbroadcastd m7, [o(pw_16384_m16384)] 2129 psrlq m10, 4 2130.pass1_end: 2131 punpcklwd m5, m4, m2 2132 punpckhwd m4, m2 2133 mova m1, m9 2134 vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} 2135 mova m6, m9 2136 vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16} 2137 mova m2, m9 2138 vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16} 2139 vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16} 2140 psrad m1, 12 2141 psrad m6, 12 2142 packssdw m1, m6 ; out8 -out7 -out9 out6 2143 psrad m2, 12 2144 psrad m9, 12 2145 packssdw m2, m9 ; -out11 out4 out10 -out5 2146 psrlq m4, m10, 4 2147 vpermi2q m4, m0, m2 2148 vpermt2q m0, m10, m2 2149 psrlq m5, m10, 8 2150 vpermi2q m5, m1, m3 2151 psrlq m10, 12 2152 vpermt2q m1, m10, m3 2153 punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3 2154 punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3 2155 punpcklwd m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3 2156 punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3 2157 punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1 2158 punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3 2159 punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1 2160 punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3 2161 REPX {pmulhrsw x, m7}, m2, m3, m4, m5 2162 jmp tx2q 2163.pass2: 2164 vshufi32x4 m0, m2, m4, q2020 2165 vshufi32x4 m2, m4, q3131 ; 4 5 2166 vshufi32x4 m1, m3, m5, q2020 2167 vshufi32x4 m3, m5, q3131 ; 6 7 2168 pshufd m4, m0, q1032 ; 1 0 2169 pshufd m5, m1, q1032 ; 3 2 2170 call .main_pass2 2171 pmulhrsw m0, m6 2172 pmulhrsw m1, m6 2173 psrlq m6, m4, 4 2174 mova m5, m4 2175 vpermi2q m4, m0, m2 2176 vpermt2q m0, m6, m2 2177 vpermi2q m5, m1, m3 2178 vpermt2q m1, m6, m3 2179 jmp m(idct_16x8_internal_8bpc).end2 2180ALIGN function_align 2181.main_pass1: 2182 vpbroadcastd m4, [o(pw_2896x8)] 2183 pmulhrsw m3, m4, [cq+64*0] 2184 pmulhrsw m1, m4, [cq+64*3] 2185 pmulhrsw m2, m4, [cq+64*1] 2186 pmulhrsw m4, [cq+64*2] 2187 mova m5, [o(int16_perm)] 2188 kxnorb k1, k1, k1 2189 vpblendmd m0{k1}, m1, m3 ; 0 7 2190 vmovdqa32 m3{k1}, m1 ; 6 1 2191 vpblendmd m1{k1}, m4, m2 ; 2 5 2192 vmovdqa32 m2{k1}, m4 ; 4 3 2193 REPX {vpermb x, m5, x}, m0, m1, m2, m3 2194 IADST8_1D_PACKED 1 2195 ret 2196ALIGN function_align 2197.main_pass2: 2198 IADST8_1D_PACKED 2 2199 movshdup m4, [o(permC)] 2200 pxor m5, m5 2201 psubd m5, m6 2202 packssdw m6, m5 2203 pmulhrsw m2, m6 2204 pmulhrsw m3, m6 2205 ret 2206 2207INV_TXFM_16X8_FN flipadst, dct 2208INV_TXFM_16X8_FN flipadst, adst 2209INV_TXFM_16X8_FN flipadst, flipadst 2210INV_TXFM_16X8_FN flipadst, identity 2211 2212cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2213 call m(iadst_8x16_internal_8bpc).main_pass1 2214 vpbroadcastd m7, [o(pw_m16384_16384)] 2215 psrlq m10, 20 2216 jmp m(iadst_16x8_internal_8bpc).pass1_end 2217.pass2: 2218 vshufi32x4 m0, m2, m4, q2020 2219 vshufi32x4 m2, m4, q3131 ; 4 5 2220 vshufi32x4 m1, m3, m5, q2020 2221 vshufi32x4 m3, m5, q3131 ; 6 7 2222 pshufd m4, m0, q1032 ; 1 0 2223 pshufd m5, m1, q1032 ; 3 2 2224 call m(iadst_16x8_internal_8bpc).main_pass2 2225 pmulhrsw m5, m6, m0 2226 pmulhrsw m0, m6, m1 2227 psrlq m1, m4, 12 2228 psrlq m4, 8 2229 mova m7, m4 2230 vpermi2q m4, m0, m3 2231 vpermt2q m0, m1, m3 2232 vpermi2q m1, m5, m2 2233 vpermt2q m5, m7, m2 2234 jmp m(idct_16x8_internal_8bpc).end2 2235 2236INV_TXFM_16X8_FN identity, dct 2237INV_TXFM_16X8_FN identity, adst 2238INV_TXFM_16X8_FN identity, flipadst 2239INV_TXFM_16X8_FN identity, identity 2240 2241cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2242 vpbroadcastd m0, [o(pw_2896x8)] 2243 pmulhrsw m3, m0, [cq+64*0] 2244 pmulhrsw m4, m0, [cq+64*1] 2245 pmulhrsw m5, m0, [cq+64*2] 2246 pmulhrsw m0, [cq+64*3] 2247 vpbroadcastd m7, [o(pw_1697x16)] 2248 vpbroadcastd m8, [o(pw_16384)] 2249 shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 2250 shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 2251 shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 2252 shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 2253 mova m9, [o(int8_permA)] 2254 pmulhrsw m0, m7, m2 2255 pmulhrsw m1, m7, m3 2256 pmulhrsw m6, m7, m4 2257 pmulhrsw m7, m5 2258 REPX {pmulhrsw x, m8}, m0, m1, m6, m7 2259 paddsw m2, m0 2260 paddsw m3, m1 2261 paddsw m4, m6 2262 paddsw m5, m7 2263 REPX {vpermb x, m9, x}, m2, m3, m4, m5 2264 jmp tx2q 2265.pass2: 2266 mova m7, [o(permB)] 2267 vpbroadcastd m6, [o(pw_4096)] 2268 vpermq m0, m7, m2 2269 vpermq m4, m7, m4 2270 vpermq m1, m7, m3 2271 vpermq m5, m7, m5 2272 jmp m(idct_16x8_internal_8bpc).end 2273 2274%macro INV_TXFM_16X16_FN 2 ; type1, type2 2275 INV_TXFM_FN %1, %2, 16x16 2276%ifidn %1_%2, dct_dct 2277 movsx r6d, word [cq] 2278 mov [cq], eobd 2279 imul r6d, 181 2280 mov r3d, 16 2281 add r6d, 128+512 2282 sar r6d, 8+2 2283 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 2284%endif 2285%endmacro 2286 2287INV_TXFM_16X16_FN dct, dct 2288INV_TXFM_16X16_FN dct, identity 2289INV_TXFM_16X16_FN dct, adst 2290INV_TXFM_16X16_FN dct, flipadst 2291 2292cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2293 mova m7, [o(permB)] 2294 vpermq m0, m7, [cq+64*0] 2295 vpermq m1, m7, [cq+64*1] 2296 vpermq m2, m7, [cq+64*2] 2297 vpermq m3, m7, [cq+64*3] 2298 vpermq m4, m7, [cq+64*4] 2299 vpermq m5, m7, [cq+64*5] 2300 vpermq m6, m7, [cq+64*6] 2301 vpermq m7, m7, [cq+64*7] 2302 call .main 2303 vbroadcasti32x4 m12, [o(int_shuf1)] 2304 vbroadcasti32x4 m11, [o(int_shuf2)] 2305 vpbroadcastd m13, [o(pw_8192)] 2306 pshufb m0, m12 2307 pshufb m8, m1, m11 2308 pshufb m2, m12 2309 pshufb m9, m3, m11 2310 pshufb m4, m12 2311 pshufb m10, m5, m11 2312 pshufb m6, m12 2313 pshufb m11, m7, m11 2314 REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 2315 punpckhdq m1, m0, m8 2316 punpckldq m0, m8 2317 punpckhdq m3, m2, m9 2318 punpckldq m2, m9 2319 punpckhdq m5, m4, m10 2320 punpckldq m4, m10 2321 punpckhdq m7, m6, m11 2322 punpckldq m6, m11 2323 jmp tx2q 2324.pass2: 2325 vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc 2326 vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 2327 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec 2328 vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 2329 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me 2330 vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 2331 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee 2332 vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 2333 vshufi32x4 m2, m0, m4, q3131 ; 4 5 2334 vshufi32x4 m0, m4, q2020 ; 0 1 2335 vshufi32x4 m4, m6, m8, q2020 ; 8 9 2336 vshufi32x4 m6, m8, q3131 ; 12 13 2337 vshufi32x4 m3, m1, m5, q3131 ; 6 7 2338 vshufi32x4 m1, m5, q2020 ; 2 3 2339 vshufi32x4 m5, m7, m9, q2020 ; 10 11 2340 vshufi32x4 m7, m9, q3131 ; 14 15 2341 call .main 2342 mova m8, [o(permD)] 2343 psrlq m12, m8, 4 2344 psrlq m9, m8, 8 2345 psrlq m13, m8, 12 2346 mova m10, m8 2347 vpermi2q m8, m0, m2 ; 0 1 4 5 2348 vpermt2q m0, m12, m2 2349 mova m11, m9 2350 vpermi2q m9, m1, m3 ; 2 3 6 7 2351 vpermt2q m1, m13, m3 2352 vpermi2q m10, m4, m6 ; 8 9 12 13 2353 vpermt2q m4, m12, m6 2354 vpermi2q m11, m5, m7 ; 10 11 14 15 2355 vpermt2q m5, m13, m7 2356.end: 2357 vpbroadcastd m12, [o(pw_2048)] 2358.end2: 2359 REPX {pmulhrsw x, m12}, m0, m1, m4, m5 2360.end3: 2361 REPX {pmulhrsw x, m12}, m8, m9, m10, m11 2362 lea r3, [strideq*3] 2363 lea r4, [dstq+strideq*4] 2364 lea r5, [dstq+strideq*8] 2365 lea r6, [r4 +strideq*8] 2366 mova xm3, [dstq+strideq*0] 2367 mova xm6, [dstq+strideq*2] 2368 vinserti32x4 ym3, [dstq+strideq*1], 1 2369 vinserti32x4 ym6, [dstq+r3 ], 1 2370 vinserti32x4 m3, [r4+strideq*0], 2 2371 vinserti32x4 m6, [r4+strideq*2], 2 2372 vinserti32x4 m3, [r4+strideq*1], 3 2373 vinserti32x4 m6, [r4+r3 ], 3 2374 mova xm12, [r5+strideq*0] 2375 mova xm13, [r5+strideq*2] 2376 vinserti32x4 ym12, [r5+strideq*1], 1 2377 vinserti32x4 ym13, [r5+r3 ], 1 2378 vinserti32x4 m12, [r6+strideq*0], 2 2379 vinserti32x4 m13, [r6+strideq*2], 2 2380 vinserti32x4 m12, [r6+strideq*1], 3 2381 vinserti32x4 m13, [r6+r3 ], 3 2382 pxor m7, m7 2383 REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 2384 punpcklbw m2, m3, m7 2385 punpckhbw m3, m7 2386 paddw m0, m2 2387 paddw m8, m3 2388 packuswb m0, m8 2389 punpcklbw m2, m6, m7 2390 punpckhbw m6, m7 2391 paddw m1, m2 2392 paddw m9, m6 2393 packuswb m1, m9 2394 punpcklbw m2, m12, m7 2395 punpckhbw m12, m7 2396 paddw m2, m4 2397 paddw m10, m12 2398 packuswb m2, m10 2399 punpcklbw m3, m13, m7 2400 punpckhbw m13, m7 2401 paddw m3, m5 2402 paddw m11, m13 2403 packuswb m3, m11 2404 mova [dstq+strideq*0], xm0 2405 vextracti32x4 [dstq+strideq*1], ym0, 1 2406 mova [dstq+strideq*2], xm1 2407 vextracti32x4 [dstq+r3 ], ym1, 1 2408 vextracti32x4 [r4+strideq*0], m0, 2 2409 vextracti32x4 [r4+strideq*1], m0, 3 2410 vextracti32x4 [r4+strideq*2], m1, 2 2411 vextracti32x4 [r4+r3 ], m1, 3 2412 mova [r5+strideq*0], xm2 2413 vextracti32x4 [r5+strideq*1], ym2, 1 2414 mova [r5+strideq*2], xm3 2415 vextracti32x4 [r5+r3 ], ym3, 1 2416 vextracti32x4 [r6+strideq*0], m2, 2 2417 vextracti32x4 [r6+strideq*1], m2, 3 2418 vextracti32x4 [r6+strideq*2], m3, 2 2419 vextracti32x4 [r6+r3 ], m3, 3 2420 RET 2421ALIGN function_align 2422.main_fast2: ; bottom three-quarters are zero 2423 vpbroadcastd m10, [o(pd_2048)] 2424 vpbroadcastq m13, [o(int_mshift)] 2425 vpcmpub k7, m13, m10, 6 2426.main_fast4: 2427 vpbroadcastd m2, [o(pw_401_4076x8)] 2428 vpbroadcastd m4, [o(pw_m1189_3920x8)] 2429 vpbroadcastd m3, [o(pw_799_4017x8)] 2430 pmulhrsw m2, m8 ; t8a t15a 2431 pmulhrsw m4, m1 ; t11a t12a 2432 pmulhrsw m7, m3 ; t4a t7a 2433 pxor m6, m6 2434 psubsw m0, m2, m4 ; t11a t12a 2435 paddsw m8, m2, m4 ; t8a t15a 2436 mova m1, m7 2437 jmp .main5 2438ALIGN function_align 2439.main_fast: ; bottom half is zero 2440 vpbroadcastd m10, [o(pd_2048)] 2441.main_fast3: 2442 vpbroadcastq m13, [o(int_mshift)] 2443 vpcmpub k7, m13, m10, 6 2444.main_fast5: 2445 vpbroadcastd m2, [o(pw_401_4076x8)] 2446 vpbroadcastd m4, [o(pw_m2598_3166x8)] 2447 vpbroadcastd m11, [o(pw_1931_3612x8)] 2448 vpbroadcastd m12, [o(pw_m1189_3920x8)] 2449 pmulhrsw m8, m2 ; t8a t15a 2450 vpbroadcastd m2, [o(pw_799_4017x8)] 2451 pmulhrsw m0, m4 ; t9a t14a 2452 vpbroadcastd m4, [o(pw_m2276_3406x8)] 2453 pmulhrsw m5, m11 ; t10a t13a 2454 pmulhrsw m1, m12 ; t11a t12a 2455 pmulhrsw m7, m2 ; t4a t7a 2456 pmulhrsw m3, m4 ; t5a t6a 2457 jmp .main4 2458ALIGN function_align 2459.main: 2460 IDCT16_1D_PACKED 2461 ret 2462 2463INV_TXFM_16X16_FN adst, dct 2464INV_TXFM_16X16_FN adst, adst 2465INV_TXFM_16X16_FN adst, flipadst 2466 2467cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2468 call .main_pass1 2469 vpbroadcastd m10, [o(pw_8192_m8192)] 2470 punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 2471 punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 2472 punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 2473 punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 2474 punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 2475 punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 2476 punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 2477 punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 2478 punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 2479 punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 2480 punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 2481 punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 2482 punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 2483 punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 2484 punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 2485 punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 2486.pass1_end: 2487 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 2488 jmp tx2q 2489.pass2: 2490 call .main_pass2 2491 mova m10, [o(permD)] 2492 psrlq m8, m10, 8 2493 psrlq m12, m10, 12 2494 psrlq m13, m10, 4 2495 mova m9, m8 2496 vpermi2q m8, m0, m2 ; 0 1 4 5 2497 vpermt2q m0, m12, m2 2498 vpermi2q m9, m1, m3 ; 2 3 6 7 2499 vpermt2q m1, m12, m3 2500 vpbroadcastd m12, [o(pw_2048)] 2501 mov r3d, 0xff00ff00 2502 mova m11, m10 2503 vpermi2q m10, m4, m6 ; 8 9 12 13 2504 vpermt2q m4, m13, m6 2505 kmovd k1, r3d 2506 vpermi2q m11, m5, m7 ; 10 11 14 15 2507 vpermt2q m5, m13, m7 2508 pxor m7, m7 2509 vpsubw m12{k1}, m7, m12 2510 jmp m(idct_16x16_internal_8bpc).end2 2511ALIGN function_align 2512.main_pass1: 2513 mova m4, [o(permB)] 2514 psrlq m3, m4, 4 2515 vpermq m0, m4, [cq+64*0] 2516 vpermq m7, m3, [cq+64*7] 2517 vpermq m6, m4, [cq+64*6] 2518 vpermq m1, m3, [cq+64*1] 2519 vpermq m2, m4, [cq+64*2] 2520 vpermq m5, m3, [cq+64*5] 2521 vpermq m4, m4, [cq+64*4] 2522 vpermq m3, m3, [cq+64*3] 2523 call .main 2524 vpbroadcastd m13, [o(pw_2896_2896)] 2525 vpbroadcastd m12, [o(pw_m2896_2896)] 2526 mova m2, m10 2527 vpdpwssd m2, m5, m13 ; -out5 2528 mova m8, m10 2529 vpdpwssd m8, m11, m13 ; out4 2530 mova m9, m10 2531 vpdpwssd m9, m5, m12 ; out10 2532 mova m5, m10 2533 vpdpwssd m5, m11, m12 ; -out11 2534 mova m11, m10 2535 vpdpwssd m11, m3, m13 ; -out7 2536 mova m14, m10 2537 vpdpwssd m14, m4, m13 ; out6 2538 mova m13, m10 2539 vpdpwssd m13, m3, m12 ; out8 2540 vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 2541 REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 2542 packssdw m2, m8 ; -out5 out4 2543 packssdw m5, m9, m5 ; out10 -out11 2544 packssdw m3, m11, m14 ; -out7 out6 2545 packssdw m4, m13, m10 ; out8 -out9 2546 ret 2547ALIGN function_align 2548.main_pass2: 2549 vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc 2550 vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 2551 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec 2552 vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 2553 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me 2554 vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 2555 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee 2556 vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 2557 vshufi32x4 m2, m0, m4, q3131 ; 4 5 2558 vshufi32x4 m0, m4, q2020 ; 0 1 2559 vshufi32x4 m4, m6, m8, q2020 ; 8 9 2560 vshufi32x4 m6, m8, q3131 ; 12 13 2561 vshufi32x4 m3, m1, m5, q3131 ; 6 7 2562 vshufi32x4 m1, m5, q2020 ; 2 3 2563 vshufi32x4 m5, m7, m9, q2020 ; 10 11 2564 vshufi32x4 m7, m9, q3131 ; 14 15 2565 REPX {pshufd x, x, q1032}, m1, m3, m5, m7 2566 call .main 2567 vpbroadcastd m8, [o(pw_2896x8)] 2568 pshufb m2, m11, m12 2569 pshufb m5, m12 2570 pshufb m3, m12 2571 pshufb m4, m12 2572 punpcklqdq m9, m5, m2 ; t15a t7 2573 punpckhqdq m5, m2 ; t14a t6 2574 shufps m2, m3, m4, q1032 ; t2a t10 2575 shufps m3, m4, q3210 ; t3a t11 2576 psubsw m4, m2, m3 ; out8 -out9 2577 paddsw m3, m2 ; -out7 out6 2578 paddsw m2, m5, m9 ; -out5 out4 2579 psubsw m5, m9 ; out10 -out11 2580 REPX {pmulhrsw x, m8}, m2, m3, m4, m5 2581 ret 2582ALIGN function_align 2583.main: 2584 vpbroadcastd m10, [o(pd_2048)] 2585 vpbroadcastq m13, [o(int_mshift)] 2586 punpckhwd m8, m7, m0 ; in14 in1 2587 punpcklwd m0, m7 ; in0 in15 2588 punpcklwd m7, m6, m1 ; in12 in3 2589 punpckhwd m1, m6 ; in2 in13 2590 punpckhwd m6, m5, m2 ; in10 in5 2591 punpcklwd m2, m5 ; in4 in11 2592 punpcklwd m5, m4, m3 ; in8 in7 2593 punpckhwd m3, m4 ; in6 in9 2594 vpcmpub k7, m13, m10, 6 ; 0x33... 2595 ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 2596 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 2597 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 2598 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 2599 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 2600 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 2601 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 2602 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 2603 psubsw m4, m0, m5 ; t9a t8a 2604 paddsw m0, m5 ; t1a t0a 2605 psubsw m5, m1, m6 ; t11a t10a 2606 paddsw m1, m6 ; t3a t2a 2607 psubsw m6, m2, m7 ; t13a t12a 2608 paddsw m2, m7 ; t5a t4a 2609 psubsw m7, m3, m8 ; t15a t14a 2610 paddsw m3, m8 ; t7a t6a 2611 ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 2612 ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 2613 ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 2614 ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 2615 psubsw m8, m1, m3 ; t7 t6 2616 paddsw m1, m3 ; t3 t2 2617 psubsw m3, m0, m2 ; t5 t4 2618 paddsw m0, m2 ; t1 t0 2619 psubsw m2, m5, m7 ; t14a t15a 2620 paddsw m7, m5 ; t10a t11a 2621 psubsw m5, m4, m6 ; t12a t13a 2622 paddsw m4, m6 ; t8a t9a 2623 ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a 2624 ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a 2625 ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 2626 ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 2627 vbroadcasti32x4 m12, [o(deint_shuf)] 2628 paddsw m6, m4, m7 ; -out1 out14 2629 psubsw m4, m7 ; t10 t11 2630 psubsw m11, m3, m8 ; t7 t6 2631 paddsw m8, m3 ; out12 -out3 2632 psubsw m3, m0, m1 ; t3a t2a 2633 paddsw m0, m1 ; -out15 out0 2634 paddsw m1, m2, m5 ; -out13 out2 2635 psubsw m5, m2 ; t15a t14a 2636 pshufb m0, m12 2637 pshufb m6, m12 2638 pshufb m8, m12 2639 pshufb m1, m12 2640 shufps m7, m6, m0, q1032 ; out14 -out15 2641 shufps m0, m6, m0, q3210 ; -out1 out0 2642 punpcklqdq m6, m8, m1 ; out12 -out13 2643 punpckhqdq m1, m8, m1 ; -out3 out2 2644 ret 2645 2646INV_TXFM_16X16_FN flipadst, dct 2647INV_TXFM_16X16_FN flipadst, adst 2648INV_TXFM_16X16_FN flipadst, flipadst 2649 2650cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2651 call m(iadst_16x16_internal_8bpc).main_pass1 2652 vpbroadcastd m10, [o(pw_m8192_8192)] 2653 punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 2654 punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 2655 punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 2656 punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 2657 punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 2658 punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 2659 punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 2660 punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 2661 punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 2662 punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 2663 punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 2664 punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 2665 punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 2666 punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 2667 punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 2668 punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 2669 jmp m(iadst_16x16_internal_8bpc).pass1_end 2670.pass2: 2671 call m(iadst_16x16_internal_8bpc).main_pass2 2672 mova m10, [o(permD)] 2673 psrlq m8, m10, 8 2674 psrlq m12, m10, 12 2675 psrlq m13, m10, 4 2676 mova m9, m8 2677 vpermi2q m8, m7, m5 ; 0 1 4 5 2678 vpermt2q m7, m12, m5 2679 vpermi2q m9, m6, m4 ; 2 3 6 7 2680 vpermt2q m6, m12, m4 2681 vpbroadcastd m12, [o(pw_2048)] 2682 mov r3d, 0x00ff00ff 2683 mova m11, m10 2684 vpermi2q m10, m3, m1 ; 8 9 12 13 2685 vpermt2q m3, m13, m1 2686 kmovd k1, r3d 2687 vpermi2q m11, m2, m0 ; 10 11 14 15 2688 vpermt2q m2, m13, m0 2689 pxor m0, m0 2690 vpsubw m12{k1}, m0, m12 2691 pmulhrsw m0, m7, m12 2692 pmulhrsw m1, m6, m12 2693 pmulhrsw m4, m3, m12 2694 pmulhrsw m5, m2, m12 2695 jmp m(idct_16x16_internal_8bpc).end3 2696 2697INV_TXFM_16X16_FN identity, dct 2698INV_TXFM_16X16_FN identity, identity 2699 2700cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2701 mova m8, [o(int16_perm)] 2702 vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 2703 vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 2704 vpbroadcastd m0, [o(pw_1697x16)] 2705 vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 2706 vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 2707 vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 2708 vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 2709 vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 2710 vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 2711 pmulhrsw m9, m0, m1 2712 pmulhrsw m10, m0, m2 2713 pmulhrsw m11, m0, m3 2714 pmulhrsw m12, m0, m4 2715 pmulhrsw m13, m0, m5 2716 pmulhrsw m14, m0, m6 2717 pmulhrsw m15, m0, m7 2718 pmulhrsw m0, m8 2719 REPX {psraw x, 1}, m9, m10, m11, m12 2720 pavgw m1, m9 2721 pavgw m2, m10 2722 pavgw m3, m11 2723 pavgw m4, m12 2724 REPX {psraw x, 1}, m13, m14, m15, m0 2725 pavgw m5, m13 2726 pavgw m6, m14 2727 pavgw m7, m15 2728 pavgw m8, m0 2729 punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 2730 punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 2731 punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 2732 punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 2733 punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 2734 punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 2735 punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 2736 punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 2737 jmp tx2q 2738ALIGN function_align 2739.pass2: 2740 vpbroadcastd m11, [o(pw_1697x16)] 2741 pmulhrsw m12, m11, m0 2742 pmulhrsw m13, m11, m1 2743 pmulhrsw m14, m11, m2 2744 pmulhrsw m15, m11, m3 2745 pmulhrsw m8, m11, m4 2746 pmulhrsw m9, m11, m5 2747 pmulhrsw m10, m11, m6 2748 pmulhrsw m11, m7 2749 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 2750 paddsw m0, m12 2751 paddsw m1, m13 2752 paddsw m2, m14 2753 paddsw m3, m15 2754 paddsw m8, m4 2755 movu m4, [o(permD+2)] 2756 paddsw m9, m5 2757 paddsw m6, m10 2758 paddsw m7, m11 2759 psrlq m12, m4, 4 2760 mova m5, m4 2761 mova m10, m4 2762 mova m11, m4 2763 vpermi2q m4, m0, m2 ; 8 9 12 13 2764 vpermt2q m0, m12, m2 ; 0 1 4 5 2765 vpermi2q m5, m1, m3 ; 10 11 14 15 2766 vpermt2q m1, m12, m3 ; 2 3 6 7 2767 vpermi2q m10, m8, m6 2768 vpermt2q m8, m12, m6 2769 vpermi2q m11, m9, m7 2770 vpermt2q m9, m12, m7 2771 jmp m(idct_16x16_internal_8bpc).end 2772 2773%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] 2774 vpbroadcastd m%3, [o(pw_%4_%5x8)] 2775 punpcklwd m%1, m%2, m%2 2776 pmulhrsw m%1, m%3 2777 vpbroadcastd m%3, [o(pw_%6_%7x8)] 2778 punpckhwd m%2, m%2 2779 pmulhrsw m%2, m%3 2780%endmacro 2781 2782cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob 2783%undef cmp 2784 lea r5, [o_base] 2785 test eobd, eobd 2786 jz .dconly 2787 cmp eobd, 107 2788 jb .fast 2789 mova m5, [cq+64*5] 2790 mova m3, [cq+64*3] 2791 mova m1, [cq+64*1] 2792 mova m7, [cq+64*7] 2793 mova m2, [cq+64*2] 2794 mova m6, [cq+64*6] 2795 mova m0, [cq+64*0] 2796 mova m4, [cq+64*4] 2797 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 2798 mova m8, [o(idct_8x32p)] 2799 vpbroadcastd m9, [o(pw_8192)] 2800 REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 2801 punpckldq m8, m0, m1 ; ab 2802 punpckhdq m0, m1 2803 punpckldq m1, m2, m3 ; cd 2804 punpckhdq m2, m3 2805 punpckldq m3, m4, m5 ; ef 2806 punpckhdq m4, m5 2807 punpckldq m5, m6, m7 ; gh 2808 punpckhdq m6, m7 2809 REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 2810 punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 2811 punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 2812 punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 2813 punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 2814 punpcklqdq m20, m3, m5 2815 punpckhqdq m16, m3, m5 2816 punpcklqdq m19, m4, m6 2817 punpckhqdq m17, m4, m6 2818 vinserti32x4 ym8, ym18, xm20, 1 2819 vshufi32x4 ym1, ym18, ym20, 0x03 2820 vinserti32x4 ym9, ym14, xm16, 1 2821 vshufi32x4 ym3, ym14, ym16, 0x03 2822 vinserti32x4 ym0, ym21, xm19, 1 2823 vshufi32x4 ym5, ym21, ym19, 0x03 2824 vinserti32x4 ym7, ym15, xm17, 1 2825 vshufi32x4 ym6, ym15, ym17, 0x03 2826 call m(idct_8x16_internal_8bpc).main2 2827 psrlq m12, [o(permB)], 60 2828 vpermt2q m14, m12, m16 2829 vpermt2q m21, m12, m19 2830 vpermt2q m15, m12, m17 2831 vpermi2q m12, m18, m20 2832 vextracti32x8 ym16, m14, 1 2833 vextracti32x8 ym19, m21, 1 2834 vextracti32x8 ym17, m15, 1 2835 vextracti32x8 ym20, m12, 1 2836 call .main2 2837 jmp .end 2838.fast: ; right half is zero 2839 mova m0, [o(int16_perm)] 2840 mova ym2, [cq+64*4] 2841 vinserti32x8 m2, [cq+64*0], 1 2842 mova ym3, [cq+64*6] 2843 vinserti32x8 m3, [cq+64*2], 1 2844 mova ym4, [cq+64*3] 2845 vinserti32x8 m4, [cq+64*5], 1 2846 mova ym5, [cq+64*7] 2847 vinserti32x8 m5, [cq+64*1], 1 2848 REPX {vpermb x, m0, x}, m2, m3, m4, m5 2849 call m(idct_16x8_internal_8bpc).main2 2850 vbroadcasti32x4 m4, [o(int_shuf3)] 2851 vbroadcasti32x4 m5, [o(int_shuf4)] 2852 pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 2853 pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 2854 pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 2855 pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 2856 vpbroadcastd m4, [o(pw_8192)] 2857 psrlq m5, [o(permB)], 60 2858 punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 2859 punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 2860 punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 2861 punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 2862 REPX {pmulhrsw x, m4}, m6, m17, m2, m16 2863 vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 2864 vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 2865 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 2866 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 2867 pxor ym4, ym4 2868 vpermt2q m2, m5, m6 ; 8 10 2869 vpermt2q m16, m5, m17 ; 9 11 2870 mova ym5, ym4 2871 mova ym6, ym4 2872 mova ym7, ym4 2873 vextracti32x8 ym3, m2, 1 ; 12 14 2874 vextracti32x8 ym17, m16, 1 ; 13 15 2875 call m(idct_8x16_internal_8bpc).main 2876 call .main_fast 2877.end: 2878 vpbroadcastd ym12, strided 2879 vpbroadcastd m13, [o(pw_2048)] 2880 pmulld ym7, ym12, [o(gather8d)] 2881 REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 2882 lea r3, [dstq+strideq*4] 2883 shl strideq, 4 2884 lea r4, [dstq+strideq] 2885 add r1, r3 2886 kxnorb k1, k1, k1 2887 pxor m6, m6 2888 kmovb k2, k1 2889 vpgatherdq m12{k1}, [r0+ym7] 2890 kmovb k1, k2 2891 vpgatherdq m13{k2}, [r3+ym7] 2892 kmovb k2, k1 2893 vpgatherdq m14{k1}, [r4+ym7] 2894 kmovb k1, k2 2895 vpgatherdq m15{k2}, [r1+ym7] 2896 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 2897 punpcklbw m4, m12, m6 2898 punpckhbw m12, m6 2899 paddw m0, m4 2900 paddw m1, m12 2901 packuswb m0, m1 2902 kmovb k2, k1 2903 vpscatterdq [r0+ym7]{k1}, m0 2904 punpcklbw m4, m13, m6 2905 punpckhbw m13, m6 2906 paddw m2, m4 2907 paddw m3, m13 2908 packuswb m2, m3 2909 kmovb k1, k2 2910 vpscatterdq [r3+ym7]{k2}, m2 2911 punpcklbw m4, m14, m6 2912 punpckhbw m14, m6 2913 paddw m8, m4 2914 paddw m9, m14 2915 packuswb m8, m9 2916 kmovb k2, k1 2917 vpscatterdq [r4+ym7]{k1}, m8 2918 punpcklbw m4, m15, m6 2919 punpckhbw m15, m6 2920 paddw m10, m4 2921 paddw m11, m15 2922 packuswb m10, m11 2923 vpscatterdq [r1+ym7]{k2}, m10 2924 RET 2925.dconly: 2926 movsx r6d, word [cq] 2927 mov [cq], eobd 2928 mov r3d, 32 2929 imul r6d, 181 2930 add r6d, 128+512 2931 sar r6d, 8+2 2932 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 2933INIT_YMM avx512icl 2934ALIGN function_align 2935.main_fast: ; bottom half is zero 2936 ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a 2937 ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a 2938 ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a 2939 ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a 2940 jmp .main3 2941ALIGN function_align 2942.main: 2943 punpcklwd m12, m21, m14 ; in31 in1 2944 punpckhwd m14, m21 ; in3 in29 2945 punpcklwd m21, m20, m15 ; in27 in5 2946 punpckhwd m15, m20 ; in7 in25 2947 punpcklwd m20, m19, m16 ; in23 in9 2948 punpckhwd m16, m19 ; in11 in21 2949 punpcklwd m19, m18, m17 ; in19 in13 2950 punpckhwd m17, m18 ; in15 in17 2951.main2: 2952 ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a 2953 ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a 2954 ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a 2955 ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a 2956 ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a 2957 ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a 2958 ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a 2959 ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a 2960.main3: 2961 psubsw m11, m12, m17 ; t17 t30 2962 paddsw m12, m17 ; t16 t31 2963 psubsw m17, m15, m20 ; t18 t29 2964 paddsw m20, m15 ; t19 t28 2965 psubsw m15, m21, m16 ; t21 t26 2966 paddsw m21, m16 ; t20 t27 2967 psubsw m16, m14, m19 ; t22 t25 2968 paddsw m14, m19 ; t23 t24 2969 ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a 2970 ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a 2971 ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a 2972 ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a 2973 vpbroadcastd m8, [o(pw_m3784_1567)] 2974 psubsw m19, m12, m20 ; t19a t28a 2975 paddsw m20, m12 ; t16a t31a 2976 psubsw m12, m14, m21 ; t20a t27a 2977 paddsw m14, m21 ; t23a t24a 2978 psubsw m21, m11, m17 ; t18 t29 2979 paddsw m11, m17 ; t17 t30 2980 psubsw m17, m16, m15 ; t21 t26 2981 paddsw m16, m15 ; t22 t25 2982 ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a 2983 ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 2984 ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 2985 ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a 2986 vbroadcasti32x4 m18, [o(deint_shuf)] 2987 vpbroadcastd m8, [o(pw_m2896_2896)] 2988 vpbroadcastd m9, [o(pw_2896_2896)] 2989 psubsw m15, m20, m14 ; t23 t24 2990 paddsw m20, m14 ; t16 t31 2991 psubsw m14, m11, m16 ; t22a t25a 2992 paddsw m11, m16 ; t17a t30a 2993 psubsw m16, m21, m17 ; t21 t26 2994 paddsw m21, m17 ; t18 t29 2995 psubsw m17, m19, m12 ; t20a t27a 2996 paddsw m19, m12 ; t19a t28a 2997 REPX {pshufb x, m18}, m20, m11, m21, m19 2998 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a 2999 ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 3000 packssdw m18, m13 ; t23a t22 3001 packssdw m12, m15 ; t24a t25 3002 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a 3003 ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 3004 packssdw m16, m13 ; t20 t21a 3005 packssdw m14, m15 ; t27 t26a 3006 punpcklqdq m13, m19, m21 ; t19a t18 3007 punpckhqdq m19, m21 ; t28a t29 3008 punpcklqdq m21, m20, m11 ; t16 t17a 3009 punpckhqdq m20, m11 ; t31 t30a 3010 psubsw m15, m1, m19 ; out28 out29 3011 paddsw m1, m19 ; out3 out2 3012 psubsw m9, m6, m13 ; out19 out18 3013 paddsw m6, m13 ; out12 out13 3014 psubsw m10, m5, m16 ; out20 out21 3015 paddsw m5, m16 ; out11 out10 3016 psubsw m19, m3, m12 ; out24 out25 3017 paddsw m3, m12 ; out7 out6 3018 psubsw m8, m7, m21 ; out16 out17 3019 paddsw m7, m21 ; out15 out14 3020 psubsw m21, m0, m20 ; out31 out30 3021 paddsw m0, m20 ; out0 out1 3022 psubsw m11, m4, m18 ; out23 out22 3023 paddsw m4, m18 ; out8 out9 3024 psubsw m18, m2, m14 ; out27 out26 3025 paddsw m2, m14 ; out4 out5 3026INIT_ZMM avx512icl 3027 movu m16, [o(permD+3)] 3028 vpermt2q m0, m16, m4 ; 0 1 8 9 3029 vpermt2q m8, m16, m19 ; 16 17 24 25 3030 vpermt2q m1, m16, m5 ; 3 2 11 10 3031 vpermt2q m9, m16, m18 ; 19 18 27 26 3032 vpermt2q m2, m16, m6 ; 4 5 12 13 3033 vpermt2q m10, m16, m15 ; 20 21 28 29 3034 vpermt2q m3, m16, m7 ; 7 6 15 14 3035 vpermt2q m11, m16, m21 ; 23 22 31 30 3036 vzeroupper 3037 ret 3038 3039%macro LOAD_PACKED_16X2 3 ; dst, row[1-2] 3040 vbroadcasti32x4 ym%1, [cq+16*%2] 3041 vbroadcasti32x4 ym8, [cq+16*%3] 3042 shufpd ym%1, ym8, 0x0c 3043%endmacro 3044 3045cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob 3046%undef cmp 3047 test eobd, eobd 3048 jz .dconly 3049 lea r5, [o_base] 3050 LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 3051 LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 3052 LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 3053 LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 3054 LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 3055 LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 3056 LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 3057 LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 3058 pxor m4, m4 3059 REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 3060 cmp eobd, 107 3061 jb .fast 3062 LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 3063 LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 3064 LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 3065 LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 3066 call m(idct_8x16_internal_8bpc).main 3067 LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 3068 LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 3069 LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 3070 LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 3071 pxor m8, m8 3072 REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 3073 call m(inv_txfm_add_dct_dct_8x32_8bpc).main 3074 jmp .pass2 3075.fast: ; bottom half is zero 3076 mova ym5, ym4 3077 mova ym6, ym4 3078 mova ym7, ym4 3079 call m(idct_8x16_internal_8bpc).main 3080 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast 3081.pass2: 3082 vpbroadcastd m12, [o(pw_8192)] 3083 vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 3084 vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 3085 vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 3086 vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 3087 vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 3088 vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 3089 vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 3090 vshufi32x4 m0, m8, q2020 ; 0 8 16 24 3091 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 3092 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 3093 call .main 3094 vpbroadcastd m8, [o(pw_2048)] 3095 REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 3096 lea r2, [strideq*3] 3097 lea r3, [dstq+strideq*4] 3098 movshdup m12, [o(permD)] 3099 pmovzxbw m8, [dstq+strideq*0] 3100 pmovzxbw m9, [dstq+strideq*1] 3101 pmovzxbw m10, [dstq+strideq*2] 3102 pmovzxbw m11, [dstq+r2 ] 3103 paddw m0, m8 3104 paddw m1, m9 3105 paddw m2, m10 3106 paddw m3, m11 3107 pmovzxbw m8, [r3+strideq*0] 3108 pmovzxbw m9, [r3+strideq*1] 3109 pmovzxbw m10, [r3+strideq*2] 3110 pmovzxbw m11, [r3+r2 ] 3111 paddw m4, m8 3112 paddw m5, m9 3113 paddw m6, m10 3114 paddw m7, m11 3115 packuswb m0, m1 3116 packuswb m2, m3 3117 vpermq m0, m12, m0 3118 vpermq m2, m12, m2 3119 mova [dstq+strideq*0], ym0 3120 vextracti32x8 [dstq+strideq*1], m0, 1 3121 mova [dstq+strideq*2], ym2 3122 vextracti32x8 [dstq+r2 ], m2, 1 3123 packuswb m4, m5 3124 packuswb m6, m7 3125 vpermq m4, m12, m4 3126 vpermq m6, m12, m6 3127 mova [r3+strideq*0], ym4 3128 vextracti32x8 [r3+strideq*1], m4, 1 3129 mova [r3+strideq*2], ym6 3130 vextracti32x8 [r3+r2 ], m6, 1 3131 RET 3132.dconly: 3133 movsx r6d, word [cq] 3134 mov [cq], eobd 3135 mov r3d, 8 3136.dconly2: 3137 imul r6d, 181 3138 add r6d, 128+512 3139 sar r6d, 8+2 3140.dconly3: 3141 imul r6d, 181 3142 add r6d, 128+2048 3143 sar r6d, 8+4 3144 pxor m2, m2 3145 vpbroadcastw m3, r6d 3146.dconly_loop: 3147 mova ym1, [dstq+strideq*0] 3148 vinserti32x8 m1, [dstq+strideq*1], 1 3149 punpcklbw m0, m1, m2 3150 punpckhbw m1, m2 3151 paddw m0, m3 3152 paddw m1, m3 3153 packuswb m0, m1 3154 mova [dstq+strideq*0], ym0 3155 vextracti32x8 [dstq+strideq*1], m0, 1 3156 lea dstq, [dstq+strideq*2] 3157 sub r3d, 2 3158 jg .dconly_loop 3159 RET 3160ALIGN function_align 3161.main: 3162 vpbroadcastd m10, [o(pd_2048)] 3163.main2: 3164 ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a 3165 ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a 3166 ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 3167 vpbroadcastd m11, [o(pw_2896_2896)] 3168 vpbroadcastd m12, [o(pw_m2896_2896)] 3169 ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 3170.main3: 3171 paddsw m8, m1, m5 ; t4 3172 psubsw m1, m5 ; t5a 3173 paddsw m9, m7, m3 ; t7 3174 psubsw m7, m3 ; t6a 3175 ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 3176 psubsw m5, m0, m2 ; dct4 out2 3177 paddsw m2, m0 ; dct4 out1 3178 paddsw m0, m4, m6 ; dct4 out0 3179 psubsw m4, m6 ; dct4 out3 3180 psubsw m6, m2, m1 ; out6 3181 paddsw m1, m2 ; out1 3182 paddsw m2, m5, m7 ; out2 3183 psubsw m5, m7 ; out5 3184 psubsw m7, m0, m9 ; out7 3185 paddsw m0, m9 ; out0 3186 paddsw m3, m4, m8 ; out3 3187 psubsw m4, m8 ; out4 3188 ret 3189 3190cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c 3191 vpbroadcastd m7, [pw_5] 3192 paddsw m0, m7, [cq+64*0] 3193 paddsw m1, m7, [cq+64*1] 3194 vpbroadcastd ym9, strided 3195 paddsw m2, m7, [cq+64*2] 3196 paddsw m3, m7, [cq+64*3] 3197 paddsw m4, m7, [cq+64*4] 3198 paddsw m5, m7, [cq+64*5] 3199 paddsw m6, m7, [cq+64*6] 3200 paddsw m7, [cq+64*7] 3201 pmulld ym14, ym9, [pd_0to15] 3202 lea r3, [dstq+strideq*1] 3203 lea r4, [dstq+strideq*2] 3204 kxnorb k1, k1, k1 3205 pxor m13, m13 3206 add r1, r4 ; dstq+strideq*3 3207 kmovb k2, k1 3208 vpgatherdq m9{k1}, [r0+ym14*4] 3209 kmovb k1, k2 3210 vpgatherdq m10{k2}, [r3+ym14*4] 3211 kmovb k2, k1 3212 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 3213 REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 3214 vpgatherdq m11{k1}, [r4+ym14*4] 3215 kmovb k1, k2 3216 vpgatherdq m12{k2}, [r1+ym14*4] 3217 REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 3218 punpcklbw m8, m9, m13 ; 0 8 16 24 3219 punpckhbw m9, m13 ; 4 12 20 28 3220 paddw m0, m8 3221 paddw m4, m9 3222 packuswb m0, m4 3223 kmovb k2, k1 3224 vpscatterdq [r0+ym14*4]{k1}, m0 3225 punpcklbw m8, m10, m13 ; 1 9 17 25 3226 punpckhbw m10, m13 ; 5 13 21 29 3227 paddw m1, m8 3228 paddw m5, m10 3229 packuswb m1, m5 3230 kmovb k1, k2 3231 vpscatterdq [r3+ym14*4]{k2}, m1 3232 punpcklbw m8, m11, m13 ; 2 10 18 26 3233 punpckhbw m11, m13 ; 6 14 22 30 3234 paddw m2, m8 3235 paddw m6, m11 3236 packuswb m2, m6 3237 kmovb k2, k1 3238 vpscatterdq [r4+ym14*4]{k1}, m2 3239 punpcklbw m8, m12, m13 ; 3 11 19 27 3240 punpckhbw m12, m13 ; 7 15 23 31 3241 paddw m3, m8 3242 paddw m7, m12 3243 packuswb m3, m7 3244 vpscatterdq [r1+ym14*4]{k2}, m3 3245 RET 3246 3247cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c 3248 vpbroadcastd m0, [pw_4096] 3249 pmulhrsw m3, m0, [cq+64*0] 3250 pmulhrsw m4, m0, [cq+64*4] 3251 pmulhrsw m6, m0, [cq+64*1] 3252 pmulhrsw m5, m0, [cq+64*5] 3253 pmulhrsw m7, m0, [cq+64*2] 3254 pmulhrsw m2, m0, [cq+64*6] 3255 pmulhrsw m8, m0, [cq+64*3] 3256 pmulhrsw m0, [cq+64*7] 3257 mova m13, [int8_permA] 3258 lea r3, [strideq*3] 3259 lea r4, [dstq+strideq*4] 3260 punpckldq m1, m3, m4 3261 punpckhdq m3, m4 3262 punpckldq m4, m6, m5 3263 punpckhdq m6, m5 3264 punpckldq m5, m7, m2 3265 punpckhdq m7, m2 3266 punpckldq m2, m8, m0 3267 punpckhdq m8, m0 3268 mova ym9, [dstq+strideq*0] 3269 vinserti32x8 m9, [dstq+strideq*2], 1 3270 mova ym10, [dstq+strideq*1] 3271 vinserti32x8 m10, [dstq+r3 ], 1 3272 mova ym11, [r4+strideq*0] 3273 vinserti32x8 m11, [r4+strideq*2], 1 3274 mova ym12, [r4+strideq*1] 3275 vinserti32x8 m12, [r4+r3 ], 1 3276 REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 3277 pxor m13, m13 3278 REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 3279 punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 3280 punpckhqdq m1, m4 ; b0 b2 d0 d2 3281 punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 3282 punpckhqdq m5, m2 ; b1 b3 d1 d3 3283 punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 3284 punpckhqdq m3, m6 ; f0 f2 h0 h2 3285 punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 3286 punpckhqdq m7, m8 ; f1 f3 h1 h3 3287 punpcklbw m8, m9, m13 3288 punpckhbw m9, m13 3289 paddw m0, m8 3290 paddw m4, m9 3291 packuswb m0, m4 3292 mova [dstq+strideq*0], ym0 3293 vextracti32x8 [dstq+strideq*2], m0, 1 3294 punpcklbw m8, m10, m13 3295 punpckhbw m10, m13 3296 paddw m1, m8 3297 paddw m5, m10 3298 packuswb m1, m5 3299 mova [dstq+strideq*1], ym1 3300 vextracti32x8 [dstq+r3 ], m1, 1 3301 punpcklbw m8, m11, m13 3302 punpckhbw m11, m13 3303 paddw m2, m8 3304 paddw m6, m11 3305 packuswb m2, m6 3306 mova [r4+strideq*0], ym2 3307 vextracti32x8 [r4+strideq*2], m2, 1 3308 punpcklbw m8, m12, m13 3309 punpckhbw m12, m13 3310 paddw m3, m8 3311 paddw m7, m12 3312 packuswb m3, m7 3313 mova [r4+strideq*1], ym3 3314 vextracti32x8 [r4+r3 ], m3, 1 3315 RET 3316 3317%macro IDCT_16x32_END 3 ; src[1-2], row 3318 mova xm8, [dstq+strideq*0] 3319 vinserti32x4 ym8, [dstq+strideq*1], 1 3320 mova xm9, [dstq+r3 ] 3321 vinserti32x4 ym9, [dstq+strideq*2], 1 3322 pmulhrsw m%1, m10 3323 pmulhrsw m%2, m10 3324 vpermb m8, m11, m8 3325 vpermb m9, m11, m9 3326 mova [cq+64*(%3*2+0)], m13 3327 mova [cq+64*(%3*2+1)], m13 3328 paddw m8, m%1 3329 paddw m9, m%2 3330 packuswb m8, m9 3331 vpermd m8, m12, m8 3332 mova [dstq+strideq*0], xm8 3333 vextracti32x4 [dstq+strideq*1], ym8, 1 3334 vextracti32x4 [dstq+strideq*2], m8, 2 3335 vextracti32x4 [dstq+r3 ], m8, 3 3336%if %1 != 20 3337 lea dstq, [dstq+strideq*4] 3338%endif 3339%endmacro 3340 3341cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob 3342%undef cmp 3343 lea r5, [o_base] 3344 test eobd, eobd 3345 jz .dconly 3346 vpbroadcastd m15, [o(pw_2896x8)] 3347 cmp eobd, 151 3348 jb .fast 3349 pmulhrsw m5, m15, [cq+64*10] 3350 pmulhrsw m3, m15, [cq+64* 6] 3351 pmulhrsw m1, m15, [cq+64* 2] 3352 pmulhrsw m7, m15, [cq+64*14] 3353 pmulhrsw m2, m15, [cq+64* 4] 3354 pmulhrsw m6, m15, [cq+64*12] 3355 pmulhrsw m0, m15, [cq+64* 0] 3356 pmulhrsw m4, m15, [cq+64* 8] 3357 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 3358 pmulhrsw m14, m15, [cq+64* 1] 3359 pmulhrsw m21, m15, [cq+64*15] 3360 pmulhrsw m18, m15, [cq+64* 9] 3361 pmulhrsw m17, m15, [cq+64* 7] 3362 pmulhrsw m16, m15, [cq+64* 5] 3363 pmulhrsw m19, m15, [cq+64*11] 3364 pmulhrsw m20, m15, [cq+64*13] 3365 pmulhrsw m15, [cq+64* 3] 3366 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 3367 mova m8, [o(idct_16x32p)] 3368 vpbroadcastd m9, [o(pw_16384)] 3369 REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3370 m14, m15, m16, m17, m18, m19, m20, m21 3371 punpckldq m8, m0, m1 3372 punpckhdq m0, m1 3373 punpckldq m1, m2, m3 3374 punpckhdq m2, m3 3375 REPX {pmulhrsw x, m9}, m8, m0, m1, m2 3376 punpckldq m3, m4, m5 3377 punpckhdq m4, m5 3378 punpckldq m5, m6, m7 3379 punpckhdq m6, m7 3380 REPX {pmulhrsw x, m9}, m3, m4, m5, m6 3381 punpckldq m7, m14, m15 3382 punpckhdq m14, m15 3383 punpckldq m15, m16, m17 3384 punpckhdq m16, m17 3385 REPX {pmulhrsw x, m9}, m7, m14, m15, m16 3386 punpckldq m17, m18, m19 3387 punpckhdq m18, m19 3388 punpckldq m19, m20, m21 3389 punpckhdq m20, m21 3390 REPX {pmulhrsw x, m9}, m17, m18, m19, m20 3391 punpcklqdq m21, m8, m1 3392 punpckhqdq m8, m1 3393 punpcklqdq m1, m0, m2 3394 punpckhqdq m0, m2 3395 punpcklqdq m2, m3, m5 3396 punpckhqdq m3, m5 3397 punpcklqdq m5, m4, m6 3398 punpckhqdq m4, m6 3399 punpcklqdq m6, m7, m15 3400 punpckhqdq m7, m15 3401 punpcklqdq m15, m14, m16 3402 punpckhqdq m14, m16 3403 punpcklqdq m16, m17, m19 3404 punpckhqdq m17, m19 3405 punpcklqdq m19, m18, m20 3406 punpckhqdq m18, m20 3407 vinserti32x8 m20, m21, ym2, 1 3408 vshufi32x4 m21, m2, q3232 3409 vinserti32x8 m2, m8, ym3, 1 3410 vshufi32x4 m8, m3, q3232 3411 vinserti32x8 m3, m1, ym5, 1 3412 vshufi32x4 m1, m5, q3232 3413 vinserti32x8 m5, m0, ym4, 1 3414 vshufi32x4 m0, m4, q3232 3415 vinserti32x8 m4, m6, ym16, 1 3416 vshufi32x4 m6, m16, q3232 3417 vinserti32x8 m16, m7, ym17, 1 3418 vshufi32x4 m7, m17, q3232 3419 vinserti32x8 m17, m15, ym19, 1 3420 vshufi32x4 m15, m19, q3232 3421 vinserti32x8 m19, m14, ym18, 1 3422 vshufi32x4 m14, m18, q3232 3423 vshufi32x4 m18, m21, m6, q3131 ; 27 5 3424 vshufi32x4 m21, m6, q2020 ; 31 1 3425 vshufi32x4 m6, m8, m7, q2020 ; 24 8 3426 vshufi32x4 m8, m7, q3131 ; 30 2 3427 vshufi32x4 m7, m1, m15, q2020 ; 28 4 3428 vshufi32x4 m1, m15, q3131 ; 6 26 3429 vshufi32x4 m15, m0, m14, q2020 ; 7 25 3430 vshufi32x4 m0, m14, q3131 ; 14 18 3431 vshufi32x4 m14, m20, m4, q2020 ; 3 29 3432 vshufi32x4 m20, m4, q3131 ; 23 9 3433 vshufi32x4 m9, m3, m17, q2020 ; 16 0 3434 vshufi32x4 m3, m17, q3131 ; 12 20 3435 vshufi32x4 m17, m5, m19, q2020 ; 15 17 3436 vshufi32x4 m5, m19, q3131 ; 22 10 3437 vshufi32x4 m19, m2, m16, q2020 ; 19 13 3438 vshufi32x4 m16, m2, m16, q3131 ; 11 21 3439 call m(idct_16x16_internal_8bpc).main3 3440 call .main_oddhalf 3441 jmp .pass2 3442.fast: ; right half is zero 3443 mova ym8, [cq+64*15] 3444 vinserti32x8 m8, [cq+64* 1], 1 3445 mova m2, [o(int16_perm)] 3446 mova ym9, [cq+64* 8] 3447 vinserti32x8 m9, [cq+64* 0], 1 3448 mova ym0, [cq+64* 7] 3449 vinserti32x8 m0, [cq+64* 9], 1 3450 mova ym7, [cq+64*14] 3451 vinserti32x8 m7, [cq+64* 2], 1 3452 mova ym1, [cq+64* 3] 3453 vinserti32x8 m1, [cq+64*13], 1 3454 mova ym3, [cq+64* 6] 3455 vinserti32x8 m3, [cq+64*10], 1 3456 mova ym5, [cq+64*11] 3457 vinserti32x8 m5, [cq+64* 5], 1 3458 mova ym6, [cq+64*12] 3459 vinserti32x8 m6, [cq+64* 4], 1 3460 REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 3461 REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 3462 call m(idct_16x16_internal_8bpc).main2 3463 vbroadcasti32x4 m8, [o(int_shuf3)] 3464 vbroadcasti32x4 m9, [o(int_shuf4)] 3465 vpbroadcastd m11, [o(pw_16384)] 3466 pshufb m0, m8 3467 pshufb m1, m9 3468 pshufb m2, m8 3469 pshufb m3, m9 3470 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 3471 pshufb m4, m8 3472 pshufb m5, m9 3473 pshufb m6, m8 3474 pshufb m7, m9 3475 REPX {pmulhrsw x, m11}, m4, m5, m6, m7 3476 punpckhdq m17, m0, m1 3477 punpckldq m0, m1 3478 punpckhdq m16, m2, m3 3479 punpckldq m2, m3 3480 punpckhdq m18, m4, m5 3481 punpckldq m4, m5 3482 punpckhdq m5, m6, m7 3483 punpckldq m6, m7 3484 vinserti32x8 m1, m0, ym2, 1 3485 vshufi32x4 m3, m0, m2, q3232 3486 vinserti32x8 m2, m4, ym6, 1 3487 vshufi32x4 m4, m6, q3232 3488 vinserti32x8 m15, m17, ym16, 1 3489 vshufi32x4 m17, m16, q3232 3490 vinserti32x8 m16, m18, ym5, 1 3491 vshufi32x4 m18, m5, q3232 3492 vshufi32x4 m0, m1, m2, q2020 ; 0 2 3493 vshufi32x4 m1, m2, q3131 ; 4 6 3494 vshufi32x4 m2, m3, m4, q2020 ; 8 10 3495 vshufi32x4 m3, m4, q3131 ; 12 14 3496 vshufi32x4 m14, m15, m16, q2020 ; 1 3 3497 vshufi32x4 m15, m16, q3131 ; 5 7 3498 vshufi32x4 m16, m17, m18, q2020 ; 9 11 3499 vshufi32x4 m17, m18, q3131 ; 13 15 3500 pxor m6, m6 3501 punpckhwd m8, m0, m0 3502 punpcklwd m9, m6, m0 3503 punpckhwd m0, m3, m3 3504 punpckhwd m5, m2, m2 3505 punpcklwd m7, m1, m1 3506 punpckhwd m1, m1 3507 punpcklwd m3, m3 3508 punpcklwd m6, m2 3509 call m(idct_16x16_internal_8bpc).main_fast5 3510 punpcklwd m21, m14, m14 3511 punpckhwd m14, m14 3512 punpcklwd m18, m15, m15 3513 punpckhwd m15, m15 3514 punpcklwd m20, m16, m16 3515 punpckhwd m16, m16 3516 punpcklwd m19, m17, m17 3517 punpckhwd m17, m17 3518 call .main_oddhalf_fast 3519.pass2: 3520 vpbroadcastd m10, [o(pw_2048)] 3521 mova m11, [o(end_16x32p)] 3522 lea r3, [strideq*3] 3523 pxor m13, m13 3524 psrld m12, m11, 8 3525 IDCT_16x32_END 0, 1, 0 3526 IDCT_16x32_END 2, 3, 1 3527 IDCT_16x32_END 4, 5, 2 3528 IDCT_16x32_END 6, 7, 3 3529 IDCT_16x32_END 14, 15, 4 3530 IDCT_16x32_END 16, 17, 5 3531 IDCT_16x32_END 18, 19, 6 3532 IDCT_16x32_END 20, 21, 7 3533 RET 3534ALIGN function_align 3535.dconly: 3536 movsx r6d, word [cq] 3537 mov [cq], eobd 3538 mov r3d, 32 3539 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly 3540ALIGN function_align 3541.main_oddhalf_fast2: ; bottom three-quarters are zero 3542 vpbroadcastd m8, [o(pw_201_4091x8)] 3543 vpbroadcastd m20, [o(pw_m1380_3857x8)] 3544 vpbroadcastd m9, [o(pw_995_3973x8)] 3545 vpbroadcastd m16, [o(pw_m601_4052x8)] 3546 pmulhrsw m21, m8 ; t16a, t31a 3547 pmulhrsw m20, m15 ; t19a, t28a 3548 pmulhrsw m18, m9 ; t20a, t27a 3549 pmulhrsw m14, m16 ; t23a, t24a 3550 mova m8, m21 3551 mova m17, m20 3552 mova m15, m18 3553 mova m16, m14 3554 jmp .main3 3555ALIGN function_align 3556.main_oddhalf_fast: ; bottom half is zero 3557 vpbroadcastd m8, [o(pw_201_4091x8)] 3558 vpbroadcastd m9, [o(pw_m2751_3035x8)] 3559 vpbroadcastd m11, [o(pw_1751_3703x8)] 3560 vpbroadcastd m12, [o(pw_m1380_3857x8)] 3561 pmulhrsw m21, m8 ; t16a, t31a 3562 vpbroadcastd m8, [o(pw_995_3973x8)] 3563 pmulhrsw m17, m9 ; t17a, t30a 3564 vpbroadcastd m9, [o(pw_m2106_3513x8)] 3565 pmulhrsw m20, m11 ; t18a, t29a 3566 vpbroadcastd m11, [o(pw_2440_3290x8)] 3567 pmulhrsw m15, m12 ; t19a, t28a 3568 vpbroadcastd m12, [o(pw_m601_4052x8)] 3569 pmulhrsw m18, m8 ; t20a, t27a 3570 pmulhrsw m16, m9 ; t21a, t26a 3571 pmulhrsw m19, m11 ; t22a, t25a 3572 pmulhrsw m14, m12 ; t23a, t24a 3573 jmp .main2 3574ALIGN function_align 3575.main_oddhalf: 3576 ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a 3577 ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a 3578 ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a 3579 ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a 3580 ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a 3581 ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a 3582 ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a 3583 ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a 3584.main2: 3585 psubsw m8, m21, m17 ; t17 t30 3586 paddsw m21, m17 ; t16 t31 3587 psubsw m17, m15, m20 ; t18 t29 3588 paddsw m20, m15 ; t19 t28 3589 psubsw m15, m18, m16 ; t21 t26 3590 paddsw m18, m16 ; t20 t27 3591 psubsw m16, m14, m19 ; t22 t25 3592 paddsw m14, m19 ; t23 t24 3593.main3: 3594 ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a 3595 ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a 3596 ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a 3597 ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a 3598 vpbroadcastd m11, [o(pw_m3784_1567)] 3599 psubsw m19, m21, m20 ; t19a t28a 3600 paddsw m21, m20 ; t16a t31a 3601 psubsw m20, m14, m18 ; t20a t27a 3602 paddsw m14, m18 ; t23a t24a 3603 psubsw m18, m8, m17 ; t18 t29 3604 paddsw m8, m17 ; t17 t30 3605 psubsw m17, m16, m15 ; t21 t26 3606 paddsw m15, m16 ; t22 t25 3607 ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a 3608 ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 3609 ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 3610 ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a 3611 vbroadcasti32x4 m9, [o(deint_shuf)] 3612 psubsw m16, m21, m14 ; t23 t24 3613 paddsw m14, m21 ; t16 t31 3614 psubsw m21, m8, m15 ; t22a t25a 3615 paddsw m15, m8 ; t17a t30a 3616 psubsw m8, m18, m17 ; t21 t26 3617 paddsw m18, m17 ; t18 t29 3618 paddsw m17, m19, m20 ; t19a t28a 3619 psubsw m19, m20 ; t20a t27a 3620 vpbroadcastd m11, [o(pw_m2896_2896)] 3621 vpbroadcastd m12, [o(pw_2896_2896)] 3622 REPX {pshufb x, m9}, m14, m15, m18, m17 3623 mova m9, m10 3624 vpdpwssd m9, m16, m11 3625 mova m20, m10 3626 vpdpwssd m20, m21, m11 3627 psrad m9, 12 3628 psrad m20, 12 3629 packssdw m9, m20 ; t23a t22 3630 mova m20, m10 3631 vpdpwssd m20, m16, m12 3632 mova m16, m10 3633 vpdpwssd m16, m21, m12 3634 psrad m20, 12 3635 psrad m16, 12 3636 packssdw m16, m20, m16 ; t24a t25 3637 ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a 3638 ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27 3639 packssdw m11, m20 ; t27 t26a 3640 packssdw m8, m21 ; t20 t21a 3641 punpcklqdq m20, m14, m15 ; t16 t17a 3642 punpckhqdq m14, m15 ; t31 t30a 3643 punpckhqdq m15, m17, m18 ; t28a t29 3644 punpcklqdq m17, m18 ; t19a t18 3645 psubsw m21, m0, m14 ; out31 out30 3646 paddsw m0, m14 ; out0 out1 3647 psubsw m14, m7, m20 ; out16 out17 3648 paddsw m7, m20 ; out15 out14 3649 psubsw m20, m1, m15 ; out28 out29 3650 paddsw m1, m15 ; out3 out2 3651 psubsw m15, m6, m17 ; out19 out18 3652 paddsw m6, m17 ; out12 out13 3653 psubsw m17, m4, m9 ; out23 out22 3654 paddsw m4, m9 ; out8 out9 3655 psubsw m18, m3, m16 ; out24 out25 3656 paddsw m3, m16 ; out7 out6 3657 psubsw m16, m5, m8 ; out20 out21 3658 paddsw m5, m8 ; out11 out10 3659 psubsw m19, m2, m11 ; out27 out26 3660 paddsw m2, m11 ; out4 out5 3661 ret 3662 3663cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob 3664%undef cmp 3665 lea r5, [o_base] 3666 test eobd, eobd 3667 jz .dconly 3668 mova m21, [o(permB)] 3669 vpermq m1, m21, [cq+64* 0] ; 0 1 3670 vpermq m14, m21, [cq+64* 1] ; 2 3 3671 vpermq m20, m21, [cq+64* 2] ; 4 5 3672 vpermq m15, m21, [cq+64* 3] ; 6 7 3673 vpbroadcastd m8, [o(pw_2896x8)] 3674 vpermq m2, m21, [cq+64* 4] ; 8 9 3675 vpermq m16, m21, [cq+64* 5] ; 10 11 3676 vpermq m3, m21, [cq+64* 6] ; 12 13 3677 vpermq m17, m21, [cq+64* 7] ; 14 15 3678 REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 3679 pxor m12, m12 3680 REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 3681 cmp eobd, 151 3682 jb .fast 3683 vpermq m9, m21, [cq+64* 8] ; 16 17 3684 vpermq m19, m21, [cq+64* 9] ; 18 19 3685 vpermq m4, m21, [cq+64*10] ; 20 21 3686 vpermq m5, m21, [cq+64*11] ; 22 23 3687 vpermq m6, m21, [cq+64*12] ; 24 25 3688 vpermq m18, m21, [cq+64*13] ; 26 27 3689 vpermq m7, m21, [cq+64*14] ; 28 29 3690 vpermq m21, m21, [cq+64*15] ; 30 31 3691 REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 3692 REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 3693 punpcklwd m8, m21, m14 ; 30 2 3694 punpckhwd m21, m1 ; 31 1 3695 punpcklwd m0, m17, m19 ; 14 18 3696 punpckhwd m17, m9 ; 15 17 3697 punpcklwd m9, m1 ; 16 0 3698 punpckhwd m14, m7 ; 3 29 3699 punpcklwd m1, m15, m18 ; 6 26 3700 punpckhwd m15, m6 ; 7 25 3701 punpcklwd m6, m2 ; 24 8 3702 punpckhwd m19, m3 ; 19 13 3703 punpcklwd m3, m4 ; 12 20 3704 punpckhwd m18, m20 ; 27 5 3705 punpcklwd m7, m20 ; 28 4 3706 punpckhwd m20, m5, m2 ; 23 9 3707 punpcklwd m5, m16 ; 22 10 3708 punpckhwd m16, m4 ; 11 21 3709 call m(idct_16x16_internal_8bpc).main2 3710 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 3711 jmp .pass2 3712.fast: ; bottom half zero 3713 punpcklwd m8, m14, m14 ; 2 3714 punpcklwd m0, m17, m17 ; 14 3715 punpcklwd m5, m16, m16 ; 10 3716 punpcklwd m9, m12, m1 ; __ 0 3717 punpckhwd m21, m1, m1 ; 1 3718 punpcklwd m1, m15, m15 ; 6 3719 punpcklwd m7, m20, m20 ; 4 3720 punpckhwd m19, m3, m3 ; 13 3721 punpcklwd m3, m3 ; 12 3722 punpcklwd m6, m12, m2 ; __ 8 3723 punpckhwd m18, m20, m20 ; 5 3724 punpckhwd m20, m2, m2 ; 9 3725 call m(idct_16x16_internal_8bpc).main_fast 3726 punpckhwd m15, m15 ; 7 3727 punpckhwd m14, m14 ; 3 3728 punpckhwd m16, m16 ; 11 3729 punpckhwd m17, m17 ; 15 3730 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 3731.pass2: 3732 vpbroadcastd m9, [o(pw_16384)] 3733 call .transpose_round 3734 vshufi32x4 m16, m14, m2, q3131 ; 5 3735 vshufi32x4 m14, m2, q2020 ; 1 3736 vshufi32x4 m2, m0, m3, q3131 ; 4 3737 vshufi32x4 m0, m3, q2020 ; 0 3738 vshufi32x4 m3, m1, m18, q3131 ; 6 3739 vshufi32x4 m1, m18, q2020 ; 2 3740 vshufi32x4 m18, m20, m6, q2020 ; 9 3741 vshufi32x4 m20, m6, q3131 ; 13 3742 vshufi32x4 m6, m21, m4, q3131 ; 12 3743 vshufi32x4 m4, m21, m4, q2020 ; 8 3744 vshufi32x4 m21, m19, m7, q3131 ; 15 3745 vshufi32x4 m19, m7, q2020 ; 11 3746 vshufi32x4 m7, m5, m15, q3131 ; 14 3747 vshufi32x4 m5, m15, q2020 ; 10 3748 vshufi32x4 m15, m17, m9, q2020 ; 3 3749 vshufi32x4 m17, m9, q3131 ; 7 3750 call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 3751 call .main_oddhalf 3752 vpbroadcastd m12, [o(pw_2048)] 3753 movshdup m13, [o(permD)] 3754 lea r2, [strideq*3] 3755 pmovzxbw m8, [dstq+strideq*0] 3756 pmovzxbw m9, [dstq+strideq*1] 3757 pmovzxbw m10, [dstq+strideq*2] 3758 pmovzxbw m11, [dstq+r2 ] 3759 REPX {pmulhrsw x, m12}, m0, m1, m2, m3 3760 lea r3, [dstq+strideq*4] 3761 paddw m0, m8 3762 paddw m1, m9 3763 paddw m2, m10 3764 paddw m3, m11 3765 pmovzxbw m8, [r3+strideq*0] 3766 pmovzxbw m9, [r3+strideq*1] 3767 pmovzxbw m10, [r3+strideq*2] 3768 pmovzxbw m11, [r3+r2 ] 3769 REPX {pmulhrsw x, m12}, m4, m5, m6, m7 3770 lea r4, [dstq+strideq*8] 3771 packuswb m0, m1 3772 paddw m4, m8 3773 paddw m5, m9 3774 packuswb m2, m3 3775 paddw m6, m10 3776 paddw m7, m11 3777 pmovzxbw m8, [r4+strideq*0] 3778 pmovzxbw m9, [r4+strideq*1] 3779 pmovzxbw m10, [r4+strideq*2] 3780 pmovzxbw m11, [r4+r2 ] 3781 REPX {pmulhrsw x, m12}, m14, m15, m16, m17 3782 lea r5, [r3+strideq*8] 3783 packuswb m4, m5 3784 paddw m14, m8 3785 paddw m15, m9 3786 packuswb m6, m7 3787 paddw m16, m10 3788 paddw m17, m11 3789 pmovzxbw m8, [r5+strideq*0] 3790 pmovzxbw m9, [r5+strideq*1] 3791 pmovzxbw m10, [r5+strideq*2] 3792 pmovzxbw m11, [r5+r2 ] 3793 REPX {pmulhrsw x, m12}, m18, m19, m20, m21 3794 packuswb m14, m15 3795 paddw m18, m8 3796 paddw m19, m9 3797 packuswb m16, m17 3798 paddw m20, m10 3799 paddw m21, m11 3800 packuswb m18, m19 3801 packuswb m20, m21 3802 REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 3803 mova [dstq+strideq*0], ym0 3804 vextracti32x8 [dstq+strideq*1], m0, 1 3805 mova [dstq+strideq*2], ym2 3806 vextracti32x8 [dstq+r2 ], m2, 1 3807 mova [r3+strideq*0], ym4 3808 vextracti32x8 [r3+strideq*1], m4, 1 3809 mova [r3+strideq*2], ym6 3810 vextracti32x8 [r3+r2 ], m6, 1 3811 mova [r4+strideq*0], ym14 3812 vextracti32x8 [r4+strideq*1], m14, 1 3813 mova [r4+strideq*2], ym16 3814 vextracti32x8 [r4+r2 ], m16, 1 3815 mova [r5+strideq*0], ym18 3816 vextracti32x8 [r5+strideq*1], m18, 1 3817 mova [r5+strideq*2], ym20 3818 vextracti32x8 [r5+r2 ], m20, 1 3819 RET 3820ALIGN function_align 3821.dconly: 3822 movsx r6d, word [cq] 3823 mov [cq], eobd 3824 imul r6d, 181 3825 mov r3d, 16 3826 add r6d, 128 3827 sar r6d, 8 3828 imul r6d, 181 3829 add r6d, 128+256 3830 sar r6d, 8+1 3831 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 3832ALIGN function_align 3833.main_oddhalf_fast2: ; bottom three-quarters are zero 3834 vpbroadcastd m9, [o(pw_2896x8)] 3835 vpbroadcastd m2, [o(pw_4017x8)] 3836 vpbroadcastd m3, [o(pw_799x8)] 3837 vpbroadcastd m18, [o(pw_4076x8)] 3838 vpbroadcastd m19, [o(pw_401x8)] 3839 vpbroadcastd m20, [o(pw_m1189x8)] 3840 vpbroadcastd m16, [o(pw_3920x8)] 3841 pmulhrsw m9, m0 ; t0 3842 pmulhrsw m2, m1 ; t7a 3843 pmulhrsw m1, m3 ; t4a 3844 pmulhrsw m18, m14 ; t15a 3845 pmulhrsw m14, m19 ; t8a 3846 pmulhrsw m20, m15 ; t11a 3847 pmulhrsw m15, m16 ; t12a 3848 psubsw m7, m9, m2 ; idct8 out7 3849 paddsw m0, m9, m2 ; idct8 out0 3850 psubsw m4, m9, m1 ; idct8 out4 3851 paddsw m3, m9, m1 ; idct8 out3 3852 ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 3853 mova m21, m18 3854 mova m19, m14 3855 mova m16, m15 3856 mova m8, m20 3857 psubsw m6, m9, m1 ; idct8 out6 3858 paddsw m1, m9 ; idct8 out1 3859 psubsw m5, m9, m2 ; idct8 out5 3860 paddsw m2, m9 ; idct8 out2 3861 jmp .main3 3862ALIGN function_align 3863.main_oddhalf_fast: ; bottom half is zero 3864 vpbroadcastd m5, [o(pw_m2276x8)] 3865 vpbroadcastd m11, [o(pw_3406x8)] 3866 vpbroadcastd m7, [o(pw_4017x8)] 3867 vpbroadcastd m12, [o(pw_799x8)] 3868 vpbroadcastd m6, [o(pw_3784x8)] 3869 vpbroadcastd m10, [o(pw_1567x8)] 3870 vpbroadcastd m4, [o(pw_2896x8)] 3871 pmulhrsw m5, m3 ; t5a 3872 pmulhrsw m3, m11 ; t6a 3873 pmulhrsw m7, m1 ; t7a 3874 pmulhrsw m1, m12 ; t4a 3875 pmulhrsw m6, m2 ; t3 3876 pmulhrsw m2, m10 ; t2 3877 pmulhrsw m4, m0 ; t0 3878 vpbroadcastd m11, [o(pw_2896_2896)] 3879 vpbroadcastd m12, [o(pw_m2896_2896)] 3880 vpbroadcastd m10, [o(pd_2048)] 3881 mova m0, m4 ; t1 3882 call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 3883 vpbroadcastd m21, [o(pw_4076x8)] 3884 vpbroadcastd m8, [o(pw_401x8)] 3885 vpbroadcastd m18, [o(pw_m2598x8)] 3886 vpbroadcastd m9, [o(pw_3166x8)] 3887 vpbroadcastd m19, [o(pw_3612x8)] 3888 vpbroadcastd m11, [o(pw_1931x8)] 3889 vpbroadcastd m20, [o(pw_m1189x8)] 3890 vpbroadcastd m12, [o(pw_3920x8)] 3891 pmulhrsw m21, m14 ; t15a 3892 pmulhrsw m14, m8 ; t8a 3893 pmulhrsw m18, m17 ; t9a 3894 pmulhrsw m17, m9 ; t14a 3895 pmulhrsw m19, m16 ; t13a 3896 pmulhrsw m16, m11 ; t10a 3897 pmulhrsw m20, m15 ; t11a 3898 pmulhrsw m15, m12 ; t12a 3899 jmp .main2 3900ALIGN function_align 3901.main_oddhalf: 3902 ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a 3903 ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a 3904 ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a 3905 ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a 3906.main2: 3907 paddsw m8, m20, m16 ; t11 3908 psubsw m20, m16 ; t10 3909 paddsw m16, m15, m19 ; t12 3910 psubsw m15, m19 ; t13 3911 psubsw m19, m14, m18 ; t9 3912 paddsw m14, m18 ; t8 3913 psubsw m18, m21, m17 ; t14 3914 paddsw m21, m17 ; t15 3915.main3: 3916 vpbroadcastd m11, [o(pw_1567_3784)] 3917 vpbroadcastd m12, [o(pw_m3784_1567)] 3918 ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a 3919 vpbroadcastd m11, [o(pw_m1567_m3784)] 3920 ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a 3921 vpbroadcastd m11, [o(pw_2896_2896)] 3922 vpbroadcastd m12, [o(pw_m2896_2896)] 3923 psubsw m17, m14, m8 ; t11a 3924 paddsw m8, m14 ; t8a 3925 paddsw m14, m18, m15 ; t9 3926 psubsw m18, m15 ; t10 3927 psubsw m15, m19, m20 ; t13 3928 paddsw m19, m20 ; t14 3929 paddsw m20, m21, m16 ; t15a 3930 psubsw m16, m21, m16 ; t12a 3931 ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a 3932 ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 3933 psubsw m21, m0, m20 ; out15 3934 paddsw m0, m20 ; out0 3935 psubsw m20, m1, m19 ; out14 3936 paddsw m1, m19 ; out1 3937 psubsw m19, m2, m18 ; out13 3938 paddsw m2, m18 ; out2 3939 psubsw m18, m3, m17 ; out12 3940 paddsw m3, m17 ; out3 3941 psubsw m17, m4, m16 ; out11 3942 paddsw m4, m16 ; out4 3943 psubsw m16, m5, m15 ; out10 3944 paddsw m5, m15 ; out5 3945 psubsw m15, m6, m14 ; out9 3946 paddsw m6, m14 ; out6 3947 psubsw m14, m7, m8 ; out8 3948 paddsw m7, m8 ; out7 3949 ret 3950.transpose_round: 3951 punpcklwd m8, m0, m2 3952 punpckhwd m0, m2 3953 punpcklwd m2, m1, m3 3954 punpckhwd m1, m3 3955 punpcklwd m3, m4, m6 3956 punpckhwd m4, m6 3957 punpcklwd m6, m5, m7 3958 punpckhwd m5, m7 3959 punpcklwd m7, m14, m16 3960 punpckhwd m14, m16 3961 punpcklwd m16, m15, m17 3962 punpckhwd m15, m17 3963 punpcklwd m17, m19, m21 3964 punpckhwd m19, m21 3965 punpckhwd m21, m18, m20 3966 punpcklwd m18, m20 3967 punpcklwd m20, m8, m1 3968 punpckhwd m8, m1 3969 punpcklwd m1, m0, m2 3970 punpckhwd m0, m2 3971 punpcklwd m2, m3, m5 3972 punpckhwd m3, m5 3973 punpcklwd m5, m4, m6 3974 punpckhwd m4, m6 3975 REPX {pmulhrsw x, m9}, m20, m8, m1, m0 3976 punpcklwd m6, m7, m15 3977 punpckhwd m7, m15 3978 punpcklwd m15, m14, m16 3979 punpckhwd m14, m16 3980 REPX {pmulhrsw x, m9}, m2, m3, m5, m4 3981 punpckhwd m16, m18, m19 3982 punpcklwd m18, m19 3983 punpcklwd m19, m21, m17 3984 punpckhwd m21, m17 3985 REPX {pmulhrsw x, m9}, m6, m7, m15, m14 3986 punpcklwd m17, m8, m0 ; a2 a6 aa ae 3987 punpckhwd m8, m0 ; a3 a7 ab af 3988 punpcklwd m0, m20, m1 ; a0 a4 a8 ac 3989 punpckhwd m20, m1 ; a1 a5 a9 ad 3990 REPX {pmulhrsw x, m9}, m16, m18, m19, m21 3991 punpcklwd m1, m2, m5 ; b0 b4 b8 bc 3992 punpckhwd m2, m5 ; b1 b5 b9 bd 3993 punpcklwd m5, m3, m4 ; b2 b6 ba be 3994 punpckhwd m3, m4 ; b3 b7 bb bf 3995 punpcklwd m4, m6, m15 ; c0 c4 c8 cc 3996 punpckhwd m6, m15 ; c1 c5 c9 cd 3997 punpcklwd m15, m7, m14 ; c2 c6 ca ce 3998 punpckhwd m7, m14 ; c3 c7 cb cf 3999 punpcklwd m14, m18, m19 ; d0 d4 d8 dc 4000 punpckhwd m18, m19 ; d1 d5 d9 dd 4001 punpcklwd m9, m16, m21 ; d2 d6 da de 4002 punpckhwd m16, m21 ; d3 d7 db df 4003 vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc 4004 vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 4005 vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 4006 vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be 4007 vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 4008 vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf 4009 vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 4010 vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc 4011 vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 4012 vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd 4013 vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 4014 vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd 4015 vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 4016 vshufi32x4 m15, m9, q3232 ; ca ce da de 4017 vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 4018 vshufi32x4 m7, m16, q3232 ; cb cf db df 4019 ret 4020 4021%macro IDTX_16x32 4 ; src/dst[1-4] 4022 pmulhrsw m%1, m15, [cq+64*%1] 4023 pmulhrsw m%2, m15, [cq+64*%2] 4024 pmulhrsw m%3, m15, [cq+64*%3] 4025 pmulhrsw m%4, m15, [cq+64*%4] 4026 pmulhrsw m18, m16, m%1 4027 pmulhrsw m19, m16, m%2 4028 pmulhrsw m20, m16, m%3 4029 pmulhrsw m21, m16, m%4 4030 REPX {pmulhrsw x, m17}, m18, m19, m20, m21 4031 paddsw m%1, m18 4032 paddsw m%2, m19 4033 paddsw m%3, m20 4034 paddsw m%4, m21 4035%endmacro 4036 4037%macro IDTX_16x32_STORE 2 ; src[1-2] 4038 mova xm17, [dstq+r3*0] 4039 vinserti128 ym17, [dstq+r3*4], 1 4040 vinserti32x4 m17, [dstq+r3*8], 2 4041 vinserti32x4 m17, [dstq+r4*8], 3 4042 mova [cq+64*(%1*2+0)], m18 4043 mova [cq+64*(%1*2+1)], m18 4044 punpcklbw m16, m17, m18 4045 punpckhbw m17, m18 4046 paddw m16, m%1 4047 paddw m17, m%2 4048 packuswb m16, m17 4049 mova [dstq+r3*0], xm16 4050 vextracti128 [dstq+r3*4], ym16, 1 4051 vextracti32x4 [dstq+r3*8], m16, 2 4052 vextracti32x4 [dstq+r4*8], m16, 3 4053%if %1 != 7 4054 add dstq, strideq 4055%endif 4056%endmacro 4057 4058cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c 4059 vpbroadcastd m15, [pw_2896x8] 4060 vpbroadcastd m16, [pw_1697x16] 4061 vpbroadcastd m17, [pw_16384] 4062 IDTX_16x32 0, 1, 2, 3 4063 IDTX_16x32 4, 5, 6, 7 4064 IDTX_16x32 8, 9, 10, 11 4065 IDTX_16x32 12, 13, 14, 15 4066 vpbroadcastd m16, [pw_8192] 4067 call .transpose_2x8x8_round 4068 lea r3, [strideq*2] 4069 lea r4, [strideq*3] 4070 pxor m18, m18 4071 IDTX_16x32_STORE 0, 8 4072 IDTX_16x32_STORE 1, 9 4073 IDTX_16x32_STORE 2, 10 4074 IDTX_16x32_STORE 3, 11 4075 IDTX_16x32_STORE 4, 12 4076 IDTX_16x32_STORE 5, 13 4077 IDTX_16x32_STORE 6, 14 4078 IDTX_16x32_STORE 7, 15 4079 RET 4080ALIGN function_align 4081.transpose_2x8x8_round: 4082 punpckhwd m17, m4, m5 4083 punpcklwd m4, m5 4084 punpckhwd m5, m0, m1 4085 punpcklwd m0, m1 4086 punpckhwd m1, m6, m7 4087 punpcklwd m6, m7 4088 punpckhwd m7, m2, m3 4089 punpcklwd m2, m3 4090 punpckhdq m3, m0, m2 4091 punpckldq m0, m2 4092 punpckldq m2, m4, m6 4093 punpckhdq m4, m6 4094 punpckhdq m6, m5, m7 4095 punpckldq m5, m7 4096 punpckldq m7, m17, m1 4097 punpckhdq m17, m1 4098 REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17 4099 punpckhqdq m1, m0, m2 4100 punpcklqdq m0, m2 4101 punpcklqdq m2, m3, m4 4102 punpckhqdq m3, m4 4103 punpcklqdq m4, m5, m7 4104 punpckhqdq m5, m7 4105 punpckhqdq m7, m6, m17 4106 punpcklqdq m6, m17 4107 punpckhwd m17, m12, m13 4108 punpcklwd m12, m13 4109 punpckhwd m13, m8, m9 4110 punpcklwd m8, m9 4111 punpckhwd m9, m14, m15 4112 punpcklwd m14, m15 4113 punpckhwd m15, m10, m11 4114 punpcklwd m10, m11 4115 punpckhdq m11, m8, m10 4116 punpckldq m8, m10 4117 punpckldq m10, m12, m14 4118 punpckhdq m12, m14 4119 punpckhdq m14, m13, m15 4120 punpckldq m13, m15 4121 punpckldq m15, m17, m9 4122 punpckhdq m17, m9 4123 REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 4124 punpckhqdq m9, m8, m10 4125 punpcklqdq m8, m10 4126 punpcklqdq m10, m11, m12 4127 punpckhqdq m11, m12 4128 punpcklqdq m12, m13, m15 4129 punpckhqdq m13, m15 4130 punpckhqdq m15, m14, m17 4131 punpcklqdq m14, m17 4132 ret 4133 4134%macro IDTX_32x16 4 ; dst[1-4] 4135 pmulhrsw m%2, m12, [cq+32*(%1+ 0)] 4136 pmulhrsw m18, m12, [cq+32*(%1+16)] 4137 pmulhrsw m%4, m12, [cq+32*(%3+ 0)] 4138 pmulhrsw m19, m12, [cq+32*(%3+16)] 4139 REPX {paddsw x, x}, m%2, m18, m%4, m19 4140 mova m%1, m14 4141 vpermi2q m%1, m%2, m18 4142 vpermt2q m%2, m16, m18 4143%if %3 != 14 4144 mova m%3, m14 4145%endif 4146 vpermi2q m%3, m%4, m19 4147 vpermt2q m%4, m16, m19 4148 pmulhrsw m18, m17, m%1 4149 pmulhrsw m19, m17, m%2 4150 pmulhrsw m20, m17, m%3 4151 pmulhrsw m21, m17, m%4 4152 REPX {paddsw x, x}, m%1, m%2, m%3, m%4 4153 paddsw m%1, m18 4154 paddsw m%2, m19 4155 paddsw m%3, m20 4156 paddsw m%4, m21 4157%endmacro 4158 4159%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 4160 mova ym19, [dstq+strideq*0] 4161 vinserti32x8 m19, [dstq+strideq*8], 1 4162%if %3 == 0 4163 mova [cq+64*(%1*2+0)], m20 4164 mova [cq+64*(%1*2+1)], m20 4165%endif 4166 punpcklbw m18, m19, m20 4167 punpckhbw m19, m20 4168 paddw m18, m%1 4169 paddw m19, m%2 4170 packuswb m18, m19 4171 mova [dstq+strideq*0], ym18 4172 vextracti32x8 [dstq+strideq*8], m18, 1 4173%if %3 || %1 != 7 4174 add dstq, strideq 4175%endif 4176%endmacro 4177 4178cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c 4179 vpbroadcastd m12, [pw_2896x8] 4180 movu m14, [permB+7] 4181 vpbroadcastd m17, [pw_1697x16] 4182 psrlq m16, m14, 4 4183 IDTX_32x16 0, 1, 2, 3 4184 IDTX_32x16 4, 5, 6, 7 4185 IDTX_32x16 8, 9, 10, 11 4186 IDTX_32x16 12, 13, 14, 15 4187 vpbroadcastd m16, [pw_2048] 4188 call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round 4189 pxor m20, m20 4190 IDTX_32x16_STORE 0, 8 4191 IDTX_32x16_STORE 1, 9 4192 IDTX_32x16_STORE 2, 10 4193 IDTX_32x16_STORE 3, 11 4194 IDTX_32x16_STORE 4, 12 4195 IDTX_32x16_STORE 5, 13 4196 IDTX_32x16_STORE 6, 14 4197 IDTX_32x16_STORE 7, 15 4198 RET 4199 4200%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] 4201 pmovzxbw m10, [dstq+%3] 4202 pmovzxbw m11, [r3 +%4] 4203%if %2 < 8 4204 paddsw m8, m%2, m%1 4205 psubsw m9, m%2, m%1 4206%else 4207 mova m9, [cq+64*(%2*2-16)] 4208 paddsw m8, m9, m%1 4209 psubsw m9, m%1 4210%endif 4211 pmulhrsw m8, m12 4212 pmulhrsw m9, m12 4213%if %2 >= 8 4214%if %2 == 8 4215 pxor m0, m0 4216%endif 4217 mova [cq+64*(%2*2-16)], m0 4218 mova [cq+64*(%2*2-15)], m0 4219%endif 4220 paddw m8, m10 4221 paddw m9, m11 4222 packuswb m8, m9 4223 vpermq m8, m13, m8 4224 mova [dstq+%3], ym8 4225 vextracti32x8 [r3 +%4], m8, 1 4226%if %2 == 3 || %2 == 7 || %2 == 11 4227 add dstq, r5 4228 sub r3, r5 4229%endif 4230%endmacro 4231 4232cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob 4233%undef cmp 4234 lea r5, [o_base] 4235 test eobd, eobd 4236 jz .dconly 4237 WIN64_SPILL_XMM 30 4238 cmp eobd, 136 4239 jb .fast 4240 mova m5, [cq+64*20] 4241 mova m3, [cq+64*12] 4242 mova m1, [cq+64* 4] 4243 mova m7, [cq+64*28] 4244 mova m2, [cq+64* 8] 4245 mova m6, [cq+64*24] 4246 mova m0, [cq+64* 0] 4247 mova m4, [cq+64*16] 4248 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 4249 mova m14, [cq+64* 2] 4250 mova m21, [cq+64*30] 4251 mova m18, [cq+64*18] 4252 mova m17, [cq+64*14] 4253 mova m16, [cq+64*10] 4254 mova m19, [cq+64*22] 4255 mova m20, [cq+64*26] 4256 mova m15, [cq+64* 6] 4257 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 4258 mova [cq+64* 0], m14 4259 mova [cq+64* 2], m15 4260 mova [cq+64* 4], m16 4261 mova [cq+64* 6], m17 4262 mova [cq+64* 8], m18 4263 mova [cq+64*10], m19 4264 mova [cq+64*12], m20 4265 mova [cq+64*14], m21 4266 mova m22, [cq+64* 1] 4267 mova m21, [cq+64*31] 4268 mova m14, [cq+64*17] 4269 mova m29, [cq+64*15] 4270 mova m26, [cq+64* 9] 4271 mova m17, [cq+64*23] 4272 mova m18, [cq+64*25] 4273 mova m25, [cq+64* 7] 4274 mova m24, [cq+64* 5] 4275 mova m19, [cq+64*27] 4276 mova m16, [cq+64*21] 4277 mova m27, [cq+64*11] 4278 mova m28, [cq+64*13] 4279 mova m15, [cq+64*19] 4280 mova m20, [cq+64*29] 4281 mova m23, [cq+64* 3] 4282 call .main_oddhalf 4283 vpbroadcastd m10, [o(pw_8192)] 4284 psubsw m13, m0, m29 ; 31 4285 paddsw m0, m29 ; 0 4286 psubsw m29, m1, m28 ; 30 4287 paddsw m1, m28 ; 1 4288 psubsw m28, m2, m27 ; 29 4289 paddsw m2, m27 ; 2 4290 psubsw m27, m3, m26 ; 28 4291 paddsw m3, m26 ; 3 4292 psubsw m26, m4, m25 ; 27 4293 paddsw m4, m25 ; 4 4294 psubsw m25, m5, m24 ; 26 4295 paddsw m5, m24 ; 5 4296 psubsw m24, m6, m23 ; 25 4297 paddsw m6, m23 ; 6 4298 psubsw m23, m7, m22 ; 24 4299 paddsw m7, m22 ; 7 4300 pxor m9, m9 4301 punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 4302 punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 4303 punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 4304 punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 4305 REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 4306 punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 4307 punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 4308 punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 4309 punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 4310 REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 4311 punpckhwd m3, m23, m24 4312 punpcklwd m23, m24 4313 punpckhwd m24, m25, m26 4314 punpcklwd m25, m26 4315 REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 4316 punpckhwd m26, m27, m28 4317 punpcklwd m27, m28 4318 punpckhwd m28, m29, m13 4319 punpcklwd m29, m13 4320 REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 4321 punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 4322 punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 4323 punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 4324 punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 4325 punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 4326 punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 4327 punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 4328 punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 4329 REPX {pmulhrsw x, m10}, m0, m4, m8, m22 4330 punpckhdq m13, m23, m25 4331 punpckldq m23, m25 4332 punpckhdq m25, m27, m29 4333 punpckldq m27, m29 4334 REPX {pmulhrsw x, m10}, m13, m23, m25, m27 4335 punpckhdq m9, m3, m24 4336 punpckldq m3, m24 4337 punpckhdq m24, m26, m28 4338 punpckldq m26, m28 4339 punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 4340 punpckhqdq m23, m27 ; d01 d09 d17 d25 4341 punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 4342 punpcklqdq m13, m25 ; d02 d10 d18 d26 4343 punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 4344 punpcklqdq m3, m26 ; d04 d12 d20 d28 4345 punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 4346 punpcklqdq m9, m24 ; d06 d14 d22 d30 4347 REPX {pmulhrsw x, m10}, m25, m3, m26 4348 mova [cq+64* 9], m23 4349 mova [cq+64*11], m27 4350 mova [cq+64*13], m25 4351 mova [cq+64*15], m26 4352 punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 4353 punpcklqdq m8, m22 ; a04 a12 a20 a28 4354 punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 4355 punpcklqdq m0, m4 ; a00 a08 a16 a24 4356 punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 4357 punpcklqdq m7, m2 ; a02 a10 a18 a26 4358 punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 4359 punpcklqdq m6, m1 ; a06 a14 a22 a30 4360 mova m2, [cq+64* 0] 4361 mova m11, [cq+64* 2] 4362 mova m12, [cq+64* 4] 4363 mova m29, [cq+64* 6] 4364 mova m27, [cq+64* 8] 4365 mova m26, [cq+64*10] 4366 mova m4, [cq+64*12] 4367 mova m28, [cq+64*14] 4368 psubsw m1, m2, m21 ; 23 4369 paddsw m2, m21 ; 8 4370 psubsw m21, m11, m20 ; 22 4371 paddsw m11, m20 ; 9 4372 psubsw m20, m12, m19 ; 21 4373 paddsw m12, m19 ; 10 4374 psubsw m19, m29, m18 ; 20 4375 paddsw m29, m18 ; 11 4376 psubsw m18, m27, m17 ; 19 4377 paddsw m27, m17 ; 12 4378 psubsw m17, m26, m16 ; 18 4379 paddsw m26, m16 ; 13 4380 paddsw m16, m4, m15 ; 14 4381 psubsw m4, m15 ; 17 4382 pmulhrsw m15, m6, m10 4383 psubsw m6, m28, m14 ; 16 4384 paddsw m28, m14 ; 15 4385 pmulhrsw m14, m7, m10 4386 punpcklwd m7, m6, m4 4387 punpckhwd m6, m4 4388 punpckhwd m4, m17, m18 4389 punpcklwd m17, m18 4390 punpckhwd m18, m19, m20 4391 punpcklwd m19, m20 4392 punpckhwd m20, m21, m1 4393 punpcklwd m21, m1 4394 punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 4395 punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 4396 punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 4397 punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 4398 punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 4399 punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 4400 punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 4401 punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 4402 pmulhrsw m23, m10 4403 pmulhrsw m25, m10 4404 punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 4405 punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 4406 punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 4407 punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 4408 REPX {pmulhrsw x, m10}, m28, m2, m12, m27 4409 punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 4410 punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 4411 punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 4412 punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 4413 REPX {pmulhrsw x, m10}, m16, m1, m11, m29 4414 punpckhdq m26, m19, m21 4415 punpckldq m19, m21 4416 punpckhdq m21, m6, m4 4417 punpckldq m6, m4 4418 REPX {pmulhrsw x, m10}, m26, m19, m21, m6 4419 punpckhdq m4, m18, m20 4420 punpckldq m18, m20 4421 punpckhdq m20, m7, m17 4422 punpckldq m7, m17 4423 REPX {pmulhrsw x, m10}, m4, m18, m20, m7 4424 punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 4425 punpckhqdq m28, m12 ; b03 b11 b19 b27 4426 punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 4427 punpcklqdq m2, m27 ; b00 b08 b16 b24 4428 punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 4429 punpcklqdq m1, m29 ; b04 b12 b20 b28 4430 punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 4431 punpcklqdq m16, m11 ; b06 b14 b22 b30 4432 mova [cq+64* 1], m12 4433 mova [cq+64* 3], m28 4434 mova [cq+64* 5], m27 4435 mova [cq+64* 7], m29 4436 punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 4437 punpcklqdq m20, m26 ; c02 c10 c18 c26 4438 punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 4439 punpcklqdq m7, m19 ; c00 c08 c16 c24 4440 punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 4441 punpcklqdq m6, m18 ; c04 c12 c20 c28 4442 punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 4443 punpcklqdq m21, m4 ; c06 c14 c22 c30 4444 pmulhrsw m19, m9, m10 4445 vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 4446 vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 4447 vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 4448 vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 4449 vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 4450 vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 4451 vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 4452 vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 4453 vshufi32x4 m3, m1, m6, q3131 ; 12 4454 vshufi32x4 m1, m6, q2020 ; 4 4455 vshufi32x4 m6, m4, m2, q3131 ; 24 4456 vshufi32x4 m4, m2, q2020 ; 16 4457 vshufi32x4 m2, m0, m7, q3131 ; 8 4458 vshufi32x4 m0, m7, q2020 ; 0 4459 vshufi32x4 m7, m5, m8, q3131 ; 28 4460 vshufi32x4 m5, m8, q2020 ; 20 4461 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 4462 vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 4463 vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 4464 vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 4465 vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 4466 vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 4467 vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 4468 vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 4469 vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 4470 vshufi32x4 m16, m14, m20, q3131 ; 10 4471 vshufi32x4 m14, m20, q2020 ; 2 4472 vshufi32x4 m20, m18, m17, q3131 ; 26 4473 vshufi32x4 m18, m17, q2020 ; 18 4474 vshufi32x4 m17, m15, m21, q3131 ; 14 4475 vshufi32x4 m15, m21, q2020 ; 6 4476 vshufi32x4 m21, m19, m13, q3131 ; 30 4477 vshufi32x4 m19, m13, q2020 ; 22 4478 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 4479 mova [cq+64* 0], m14 4480 mova [cq+64* 2], m15 4481 mova [cq+64* 4], m16 4482 mova [cq+64* 6], m17 4483 mova [cq+64* 8], m18 4484 mova [cq+64*10], m19 4485 mova [cq+64*12], m20 4486 mova [cq+64*14], m21 4487 mova m15, [cq+64* 1] 4488 mova m16, [cq+64* 3] 4489 mova m17, [cq+64* 5] 4490 mova m19, [cq+64* 7] 4491 mova m20, [cq+64* 9] 4492 mova m21, [cq+64*11] 4493 mova m13, [cq+64*13] 4494 mova m18, [cq+64*15] 4495 vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 4496 vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 4497 vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 4498 vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 4499 vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 4500 vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 4501 vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 4502 vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 4503 vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 4504 vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 4505 vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 4506 vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 4507 vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 4508 vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 4509 vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 4510 vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 4511 vshufi32x4 m18, m14, m26, q3131 ; 25 4512 vshufi32x4 m14, m26, q2020 ; 17 4513 vshufi32x4 m19, m15, m27, q3131 ; 27 4514 vshufi32x4 m15, m27, q2020 ; 19 4515 vshufi32x4 m20, m16, m28, q3131 ; 29 4516 vshufi32x4 m16, m28, q2020 ; 21 4517 vshufi32x4 m21, m17, m29, q3131 ; 31 4518 vshufi32x4 m17, m29, q2020 ; 23 4519 vshufi32x4 m26, m22, m8, q3131 ; 9 4520 vshufi32x4 m22, m8, q2020 ; 1 4521 vshufi32x4 m27, m23, m9, q3131 ; 11 4522 vshufi32x4 m23, m9, q2020 ; 3 4523 vshufi32x4 m28, m24, m11, q3131 ; 13 4524 vshufi32x4 m24, m11, q2020 ; 5 4525 vshufi32x4 m29, m25, m12, q3131 ; 15 4526 vshufi32x4 m25, m12, q2020 ; 7 4527 call .main_oddhalf 4528 jmp .end 4529.fast: ; bottom/right halves are zero 4530 mova m14, [o(dup16_perm)] 4531 pmovzxwd m9, [cq+64* 0] 4532 pmovzxwd m6, [cq+64* 8] 4533 vpermb m8, m14, [cq+64* 2] 4534 vpermb ym0, ym14, [cq+64*14] 4535 vpermb ym5, ym14, [cq+64*10] 4536 vpermb m1, m14, [cq+64* 6] 4537 vpermb m7, m14, [cq+64* 4] 4538 vpermb ym3, ym14, [cq+64*12] 4539 pslld m9, 16 4540 pslld m6, 16 4541 call m(idct_16x16_internal_8bpc).main_fast 4542 vpermb m21, m14, [cq+64* 1] 4543 vpermb ym17, ym14, [cq+64*15] 4544 vpermb ym20, ym14, [cq+64* 9] 4545 vpermb m15, m14, [cq+64* 7] 4546 vpermb m18, m14, [cq+64* 5] 4547 vpermb ym16, ym14, [cq+64*11] 4548 vpermb ym19, ym14, [cq+64*13] 4549 vpermb m14, m14, [cq+64* 3] 4550 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 4551 vpbroadcastd m9, [o(pw_8192)] 4552 call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round 4553 vshufi32x4 m22, m14, m2, q2020 ; 1 4554 vshufi32x4 m24, m14, m2, q3131 ; 5 4555 vshufi32x4 m23, m17, m9, q2020 ; 3 4556 vshufi32x4 m25, m17, m9, q3131 ; 7 4557 vshufi32x4 m16, m5, m15, q2020 ; 10 4558 vshufi32x4 m17, m5, m15, q3131 ; 14 4559 vshufi32x4 m14, m1, m18, q2020 ; 2 4560 vshufi32x4 m15, m1, m18, q3131 ; 6 4561 vshufi32x4 m1, m0, m3, q3131 ; 4 4562 vshufi32x4 m0, m3, q2020 ; 0 4563 vshufi32x4 m3, m21, m4, q3131 ; 12 4564 vshufi32x4 m2, m21, m4, q2020 ; 8 4565 vshufi32x4 m26, m20, m6, q2020 ; 9 4566 vshufi32x4 m28, m20, m6, q3131 ; 13 4567 vshufi32x4 m27, m19, m7, q2020 ; 11 4568 vshufi32x4 m29, m19, m7, q3131 ; 15 4569 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 4570 mova [cq+64* 0], m14 4571 mova [cq+64* 2], m15 4572 mova [cq+64* 4], m16 4573 mova [cq+64* 6], m17 4574 mova [cq+64* 8], m18 4575 mova [cq+64*10], m19 4576 mova [cq+64*12], m20 4577 mova [cq+64*14], m21 4578 call .main_oddhalf_fast 4579.end: 4580 lea r4, [strideq*3] 4581 vpbroadcastd m12, [o(pw_2048)] 4582 movshdup m13, [o(permD)] 4583 lea r3, [dstq+r4*8] 4584 lea r5, [strideq+r4] ; stride*4 4585 add r3, r5 ; dst+stride*28 4586 IDCT_32x32_END 29, 0, strideq*0, r4 4587 IDCT_32x32_END 28, 1, strideq*1, strideq*2 4588 IDCT_32x32_END 27, 2, strideq*2, strideq*1 4589 IDCT_32x32_END 26, 3, r4 , strideq*0 4590 IDCT_32x32_END 25, 4, strideq*0, r4 4591 IDCT_32x32_END 24, 5, strideq*1, strideq*2 4592 IDCT_32x32_END 23, 6, strideq*2, strideq*1 4593 IDCT_32x32_END 22, 7, r4 , strideq*0 4594 IDCT_32x32_END 21, 8, strideq*0, r4 4595 IDCT_32x32_END 20, 9, strideq*1, strideq*2 4596 IDCT_32x32_END 19, 10, strideq*2, strideq*1 4597 IDCT_32x32_END 18, 11, r4 , strideq*0 4598 IDCT_32x32_END 17, 12, strideq*0, r4 4599 IDCT_32x32_END 16, 13, strideq*1, strideq*2 4600 IDCT_32x32_END 15, 14, strideq*2, strideq*1 4601 IDCT_32x32_END 14, 15, r4 , strideq*0 4602 RET 4603.dconly: 4604 movsx r6d, word [cq] 4605 mov [cq], eobd 4606 mov r3d, 32 4607 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 4608ALIGN function_align 4609.main_oddhalf_fast2: ; bottom three-quarters are zero 4610 vpbroadcastd m21, [o(pw_4091x8)] 4611 vpbroadcastd m8, [o(pw_201x8)] 4612 vpbroadcastd m18, [o(pw_m1380x8)] 4613 vpbroadcastd m9, [o(pw_3857x8)] 4614 vpbroadcastd m19, [o(pw_3973x8)] 4615 vpbroadcastd m11, [o(pw_995x8)] 4616 vpbroadcastd m28, [o(pw_m601x8)] 4617 vpbroadcastd m12, [o(pw_4052x8)] 4618 pmulhrsw m21, m22 ; t31a 4619 pmulhrsw m22, m8 ; t16a 4620 pmulhrsw m18, m25 ; t19a 4621 pmulhrsw m25, m9 ; t28a 4622 pmulhrsw m19, m24 ; t27a 4623 pmulhrsw m24, m11 ; t20a 4624 pmulhrsw m28, m23 ; t23a 4625 pmulhrsw m23, m12 ; t24a 4626 mova m15, m21 4627 mova m8, m22 4628 mova m14, m18 4629 mova m27, m25 4630 mova m29, m19 4631 mova m26, m24 4632 mova m16, m28 4633 mova m20, m23 4634 jmp .main3 4635ALIGN function_align 4636.main_oddhalf_fast: ; bottom half is zero 4637 vpbroadcastd m21, [o(pw_4091x8)] 4638 vpbroadcastd m8, [o(pw_201x8)] 4639 vpbroadcastd m14, [o(pw_m2751x8)] 4640 vpbroadcastd m9, [o(pw_3035x8)] 4641 vpbroadcastd m17, [o(pw_3703x8)] 4642 vpbroadcastd m11, [o(pw_1751x8)] 4643 vpbroadcastd m18, [o(pw_m1380x8)] 4644 vpbroadcastd m12, [o(pw_3857x8)] 4645 pmulhrsw m21, m22 ; t31a 4646 vpbroadcastd m19, [o(pw_3973x8)] 4647 pmulhrsw m22, m8 ; t16a 4648 vpbroadcastd m8, [o(pw_995x8)] 4649 pmulhrsw m14, m29 ; t30a 4650 vpbroadcastd m16, [o(pw_m2106x8)] 4651 pmulhrsw m29, m9 ; t17a 4652 vpbroadcastd m9, [o(pw_3513x8)] 4653 pmulhrsw m17, m26 ; t29a 4654 vpbroadcastd m15, [o(pw_3290x8)] 4655 pmulhrsw m26, m11 ; t18a 4656 vpbroadcastd m11, [o(pw_2440x8)] 4657 pmulhrsw m18, m25 ; t19a 4658 vpbroadcastd m20, [o(pw_m601x8)] 4659 pmulhrsw m25, m12 ; t28a 4660 vpbroadcastd m12, [o(pw_4052x8)] 4661 pmulhrsw m19, m24 ; t27a 4662 pmulhrsw m24, m8 ; t20a 4663 pmulhrsw m16, m27 ; t21a 4664 pmulhrsw m27, m9 ; t26a 4665 pmulhrsw m15, m28 ; t25a 4666 pmulhrsw m28, m11 ; t22a 4667 pmulhrsw m20, m23 ; t23a 4668 pmulhrsw m23, m12 ; t24a 4669 jmp .main2 4670ALIGN function_align 4671.main_oddhalf: 4672 ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a 4673 ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a 4674 ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a 4675 ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a 4676 ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a 4677 ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a 4678 ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a 4679 ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a 4680.main2: 4681 psubsw m8, m22, m14 ; t17 4682 paddsw m22, m14 ; t16 4683 paddsw m14, m18, m26 ; t19 4684 psubsw m18, m26 ; t18 4685 psubsw m26, m24, m16 ; t21 4686 paddsw m24, m16 ; t20 4687 psubsw m16, m20, m28 ; t22 4688 paddsw m28, m20 ; t23 4689 psubsw m20, m23, m15 ; t25 4690 paddsw m23, m15 ; t24 4691 psubsw m15, m21, m29 ; t30 4692 paddsw m21, m29 ; t31 4693 psubsw m29, m19, m27 ; t26 4694 paddsw m19, m27 ; t27 4695 paddsw m27, m25, m17 ; t28 4696 psubsw m25, m17 ; t29 4697.main3: 4698 ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a 4699 ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a 4700 ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a 4701 ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a 4702 vpbroadcastd m12, [o(pw_m3784_1567)] 4703 vpbroadcastd m11, [o(pw_1567_3784)] 4704 psubsw m17, m21, m27 ; t28a 4705 paddsw m21, m27 ; t31a 4706 psubsw m27, m15, m25 ; t18 4707 paddsw m15, m25 ; t17 4708 psubsw m25, m20, m29 ; t21 4709 paddsw m20, m29 ; t22 4710 psubsw m29, m8, m18 ; t29 4711 paddsw m8, m18 ; t30 4712 psubsw m18, m22, m14 ; t19a 4713 paddsw m22, m14 ; t16a 4714 psubsw m14, m28, m24 ; t20a 4715 paddsw m24, m28 ; t23a 4716 paddsw m28, m16, m26 ; t25 4717 psubsw m16, m26 ; t26 4718 psubsw m26, m23, m19 ; t27a 4719 paddsw m23, m19 ; t24a 4720 ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a 4721 ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 4722 vpbroadcastd m11, [o(pw_m1567_m3784)] 4723 ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a 4724 ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 4725 vpbroadcastd m12, [o(pw_m2896_2896)] 4726 vpbroadcastd m11, [o(pw_2896_2896)] 4727 psubsw m19, m27, m25 ; t26 4728 paddsw m27, m25 ; t29 4729 psubsw m25, m17, m26 ; t20a 4730 paddsw m17, m26 ; t19a 4731 paddsw m26, m18, m14 ; t28a 4732 psubsw m18, m14 ; t27a 4733 paddsw m14, m22, m24 ; t16 4734 psubsw m22, m24 ; t23 4735 psubsw m24, m29, m16 ; t21 4736 paddsw m16, m29 ; t18 4737 paddsw m29, m21, m23 ; t31 4738 psubsw m21, m23 ; t24 4739 psubsw m23, m15, m20 ; t22a 4740 paddsw m15, m20 ; t17a 4741 psubsw m20, m8, m28 ; t25a 4742 paddsw m28, m8 ; t30a 4743 ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 4744 ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a 4745 ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a 4746 ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 4747 ret 4748 4749%macro IDTX_32x32 2 ; dst[1-2] 4750 vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which 4751 vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to 4752 vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements 4753 vmovdqa32 ym18, [cq+64*(%2+16)] 4754 vpermt2q m%1, m21, m17 4755 vpermt2q m%2, m21, m18 4756%endmacro 4757 4758cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c 4759 movu m21, [permB+7] 4760 vpbroadcastd m16, [pw_8192] 4761 pxor m20, m20 4762.loop: 4763 IDTX_32x32 0, 1 4764 IDTX_32x32 2, 3 4765 IDTX_32x32 4, 5 4766 IDTX_32x32 6, 7 4767 IDTX_32x32 8, 9 4768 IDTX_32x32 10, 11 4769 IDTX_32x32 12, 13 4770 IDTX_32x32 14, 15 4771 call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round 4772 IDTX_32x16_STORE 0, 8, 1 4773 IDTX_32x16_STORE 1, 9, 1 4774 IDTX_32x16_STORE 2, 10, 1 4775 IDTX_32x16_STORE 3, 11, 1 4776 IDTX_32x16_STORE 4, 12, 1 4777 IDTX_32x16_STORE 5, 13, 1 4778 IDTX_32x16_STORE 6, 14, 1 4779 IDTX_32x16_STORE 7, 15, 1 4780 lea dstq, [dstq+strideq*8] 4781 btc cq, 5 4782 jnc .loop 4783 mov r0d, 8 4784.zero_loop: 4785 mova [cq+64*0], m20 4786 mova [cq+64*1], m20 4787 mova [cq+64*2], m20 4788 mova [cq+64*3], m20 4789 add cq, 64*4 4790 dec r0d 4791 jg .zero_loop 4792 RET 4793 4794cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob 4795%undef cmp 4796 lea r5, [o_base] 4797 test eobd, eobd 4798 jz .dconly 4799 WIN64_SPILL_XMM 30 4800 cmp eobd, 151 4801 jb .fast 4802 mova m5, [cq+64*10] 4803 mova m3, [cq+64* 6] 4804 mova m1, [cq+64* 2] 4805 mova m7, [cq+64*14] 4806 mova m2, [cq+64* 4] 4807 mova m6, [cq+64*12] 4808 mova m0, [cq+64* 0] 4809 mova m4, [cq+64* 8] 4810 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 4811 mova m14, [cq+64* 1] 4812 mova m21, [cq+64*15] 4813 mova m18, [cq+64* 9] 4814 mova m17, [cq+64* 7] 4815 mova m16, [cq+64* 5] 4816 mova m19, [cq+64*11] 4817 mova m20, [cq+64*13] 4818 mova m15, [cq+64* 3] 4819 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 4820 vpbroadcastd m9, [o(pw_8192)] 4821%macro TRANSPOSE_8x4_ROUND 4 4822 punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7 4823 punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3 4824 punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7 4825 punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3 4826 punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3 4827 punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1 4828 punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5 4829 punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7 4830 REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4 4831%endmacro 4832 TRANSPOSE_8x4_ROUND 0, 1, 2, 3 4833 TRANSPOSE_8x4_ROUND 4, 5, 6, 7 4834 TRANSPOSE_8x4_ROUND 14, 15, 16, 17 4835 TRANSPOSE_8x4_ROUND 18, 19, 20, 21 4836 vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4 4837 vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12 4838 vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5 4839 vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13 4840 vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6 4841 vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14 4842 vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7 4843 vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15 4844 vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4 4845 vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12 4846 vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5 4847 vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13 4848 vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6 4849 vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14 4850 vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7 4851 vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15 4852 vshufi32x4 m22, m26, m4, q2020 ; 0 1 4853 vshufi32x4 m26, m4, q3131 ; 8 9 4854 vshufi32x4 m23, m27, m5, q2020 ; 2 3 4855 vshufi32x4 m27, m5, q3131 ; 10 11 4856 vshufi32x4 m24, m28, m6, q2020 ; 4 5 4857 vshufi32x4 m28, m6, q3131 ; 12 13 4858 vshufi32x4 m25, m29, m7, q2020 ; 6 7 4859 vshufi32x4 m29, m7, q3131 ; 14 15 4860 vshufi32x4 m4, m0, m14, q2020 ; 16 17 4861 vshufi32x4 m3, m0, m14, q3131 ; 24 25 4862 vshufi32x4 m20, m1, m15, q2020 ; 18 19 4863 vshufi32x4 m19, m1, m15, q3131 ; 26 27 4864 vshufi32x4 m5, m2, m16, q2020 ; 20 21 4865 vshufi32x4 m0, m2, m16, q3131 ; 28 29 4866 vshufi32x4 m16, m8, m17, q2020 ; 22 23 4867 vshufi32x4 m17, m8, m17, q3131 ; 30 31 4868 pxor m6, m6 4869 mova [cq+64* 0], m4 4870 mova [cq+64* 2], m5 4871 mova [cq+64* 4], m3 4872 mova [cq+64* 6], m0 4873 punpcklwd m8, m24, m24 ; 4 4874 punpcklwd m0, m0 ; 28 4875 punpcklwd m5, m5 ; 20 4876 punpcklwd m1, m28, m28 ; 12 4877 punpcklwd m7, m26, m26 ; 8 4878 punpcklwd m3, m3 ; 24 4879 punpcklwd m9, m6, m22 ; __ 0 4880 punpcklwd m6, m4 ; __ 16 4881 call m(idct_16x16_internal_8bpc).main_fast3 4882 mova [cq+64* 1], m20 4883 mova [cq+64* 3], m16 4884 mova [cq+64* 5], m19 4885 mova [cq+64* 7], m17 4886 punpcklwd m21, m23, m23 ; 2 4887 punpcklwd m17, m17 ; 30 4888 punpcklwd m20, m20 ; 18 4889 punpcklwd m15, m29, m29 ; 14 4890 punpcklwd m18, m27, m27 ; 10 4891 punpcklwd m16, m16 ; 22 4892 punpcklwd m19, m19 ; 26 4893 punpcklwd m14, m25, m25 ; 6 4894 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 4895 mova [cq+64* 8], m14 4896 mova [cq+64* 9], m15 4897 mova [cq+64*10], m16 4898 mova [cq+64*11], m17 4899 mova [cq+64*12], m18 4900 mova [cq+64*13], m19 4901 mova [cq+64*14], m20 4902 mova [cq+64*15], m21 4903 mova m21, [cq+64* 7] 4904 mova m14, [cq+64* 0] 4905 mova m17, [cq+64* 3] 4906 mova m18, [cq+64* 4] 4907 mova m19, [cq+64* 5] 4908 mova m16, [cq+64* 2] 4909 mova m15, [cq+64* 1] 4910 mova m20, [cq+64* 6] 4911 REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ 4912 m24, m19, m16, m27, m28, m15, m20, m23 4913 call .main_oddhalf 4914 jmp .end 4915.fast: ; right half is zero 4916 mova ym8, [cq+64*15] 4917 vinserti32x8 m8, [cq+64* 1], 1 4918 mova m2, [o(int16_perm)] 4919 mova ym9, [cq+64* 8] 4920 vinserti32x8 m9, [cq+64* 0], 1 4921 mova ym0, [cq+64* 7] 4922 vinserti32x8 m0, [cq+64* 9], 1 4923 mova ym7, [cq+64*14] 4924 vinserti32x8 m7, [cq+64* 2], 1 4925 mova ym1, [cq+64* 3] 4926 vinserti32x8 m1, [cq+64*13], 1 4927 mova ym3, [cq+64* 6] 4928 vinserti32x8 m3, [cq+64*10], 1 4929 mova ym5, [cq+64*11] 4930 vinserti32x8 m5, [cq+64* 5], 1 4931 mova ym6, [cq+64*12] 4932 vinserti32x8 m6, [cq+64* 4], 1 4933 REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 4934 call m(idct_16x16_internal_8bpc).main2 4935 vbroadcasti32x4 m8, [o(int_shuf3)] 4936 vbroadcasti32x4 m9, [o(int_shuf4)] 4937 vpbroadcastd m11, [o(pw_8192)] 4938 pshufb m0, m8 4939 pshufb m1, m9 4940 pshufb m2, m8 4941 pshufb m3, m9 4942 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 4943 pshufb m4, m8 4944 pshufb m5, m9 4945 pshufb m6, m8 4946 pshufb m7, m9 4947 REPX {pmulhrsw x, m11}, m4, m5, m6, m7 4948 punpckhdq m28, m0, m1 4949 punpckldq m0, m1 4950 punpckhdq m27, m2, m3 4951 punpckldq m2, m3 4952 punpckhdq m22, m4, m5 4953 punpckldq m4, m5 4954 punpckhdq m23, m6, m7 4955 punpckldq m6, m7 4956 vinserti32x8 m14, m0, ym2, 1 4957 vshufi32x4 m15, m0, m2, q3232 4958 vinserti32x8 m2, m4, ym6, 1 4959 vshufi32x4 m4, m6, q3232 4960 vshufi32x4 m21, m14, m2, q2020 ; 0 2 4961 vshufi32x4 m14, m2, q3131 ; 4 6 4962 vshufi32x4 m18, m15, m4, q2020 ; 8 10 4963 vshufi32x4 m15, m4, q3131 ; 12 14 4964 pxor m9, m9 4965 punpcklwd m8, m14, m14 ; 4 4966 punpcklwd m1, m15, m15 ; 12 4967 punpcklwd m7, m18, m18 ; 8 4968 punpcklwd m9, m21 ; __ 0 4969 call m(idct_16x16_internal_8bpc).main_fast4 4970 punpckhwd m21, m21 ; 2 4971 punpckhwd m15, m15 ; 14 4972 punpckhwd m18, m18 ; 10 4973 punpckhwd m14, m14 ; 6 4974 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 4975 vinserti32x8 m24, m28, ym27, 1 4976 vshufi32x4 m28, m27, q3232 4977 vinserti32x8 m27, m22, ym23, 1 4978 vshufi32x4 m22, m23, q3232 4979 vshufi32x4 m23, m24, m27, q2020 ; 1 3 4980 vshufi32x4 m24, m27, q3131 ; 5 7 4981 vshufi32x4 m27, m28, m22, q2020 ; 9 11 4982 vshufi32x4 m28, m22, q3131 ; 13 15 4983 punpcklwd m22, m23, m23 ; 1 4984 punpckhwd m29, m28, m28 ; 15 4985 punpcklwd m26, m27, m27 ; 9 4986 punpckhwd m25, m24, m24 ; 7 4987 mova [cq+64* 8], m14 4988 mova [cq+64* 9], m15 4989 mova [cq+64*10], m16 4990 mova [cq+64*11], m17 4991 punpcklwd m24, m24 ; 5 4992 punpckhwd m27, m27 ; 11 4993 punpcklwd m28, m28 ; 13 4994 punpckhwd m23, m23 ; 3 4995 mova [cq+64*12], m18 4996 mova [cq+64*13], m19 4997 mova [cq+64*14], m20 4998 mova [cq+64*15], m21 4999 call .main_oddhalf_fast 5000.end: 5001 imul r6, strideq, 60 5002 mova m10, [o(end_16x32p)] 5003 vpbroadcastd m11, [o(pw_2048)] 5004 lea r3, [strideq*3] 5005 pxor m12, m12 5006 add r6, dstq ; dst+stride*60 5007 psrldq m13, m10, 1 5008 lea r4, [strideq+r3] ; stride*4 5009%macro IDCT_16x64_END 3 ; idct32, idct64, tmp 5010%if %1 & 1 5011 %define %%s0 r3 5012 %define %%s1 strideq*2 5013 %define %%s2 strideq*1 5014 %define %%s3 strideq*0 5015%else 5016 %define %%s0 strideq*0 5017 %define %%s1 strideq*1 5018 %define %%s2 strideq*2 5019 %define %%s3 r3 5020%if %1 5021 add dstq, r4 5022 sub r6, r4 5023%endif 5024%endif 5025%if %1 < 8 5026 pmulhrsw m8, m11, m%1 5027 pmulhrsw m9, m11, m%2 5028%else 5029 mova m9, [cq+64*%1] 5030 paddsw m8, m9, m%2 ; out 0+n, 1+n 5031 psubsw m9, m%2 ; out 63-n, 62-n 5032 pmulhrsw m8, m11 5033 pmulhrsw m9, m11 5034%endif 5035 mova xm29, [dstq+%%s0] 5036 vinserti128 ym29, [dstq+%%s1], 1 5037 mova xm%3, [r6 +%%s3] 5038 vinserti128 ym%3, [r6 +%%s2], 1 5039 vpermb m29, m10, m29 5040 vpermb m%3, m10, m%3 5041 mova [cq+64*%1], m12 5042 paddw m29, m8 5043 paddw m%3, m9 5044 packuswb m29, m%3 5045 vpermd m29, m13, m29 5046 mova [dstq+%%s0], xm29 5047 vextracti128 [dstq+%%s1], ym29, 1 5048 vextracti32x4 [r6 +%%s2], m29, 2 5049 vextracti32x4 [r6 +%%s3], m29, 3 5050%endmacro 5051 IDCT_16x64_END 0, 29, 0 5052 IDCT_16x64_END 1, 28, 28 5053 IDCT_16x64_END 2, 27, 28 5054 IDCT_16x64_END 3, 26, 28 5055 IDCT_16x64_END 4, 25, 28 5056 IDCT_16x64_END 5, 24, 28 5057 IDCT_16x64_END 6, 23, 28 5058 IDCT_16x64_END 7, 22, 28 5059 IDCT_16x64_END 8, 21, 28 5060 IDCT_16x64_END 9, 20, 28 5061 IDCT_16x64_END 10, 19, 28 5062 IDCT_16x64_END 11, 18, 28 5063 IDCT_16x64_END 12, 17, 28 5064 IDCT_16x64_END 13, 16, 28 5065 IDCT_16x64_END 14, 15, 28 5066 IDCT_16x64_END 15, 14, 28 5067 RET 5068.dconly: 5069 movsx r6d, word [cq] 5070 mov [cq], eobd 5071 imul r6d, 181 5072 mov r3d, 64 5073 add r6d, 128+512 5074 sar r6d, 8+2 5075 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 5076ALIGN function_align 5077.main_oddhalf_fast: ; bottom three-quarters are zero 5078 vpbroadcastd m8, [o(pw_101_4095x8)] 5079 vpbroadcastd m21, [o(pw_m1474_3822x8)] 5080 vpbroadcastd m14, [o(pw_897_3996x8)] 5081 vpbroadcastd m17, [o(pw_m700_4036x8)] 5082 vpbroadcastd m18, [o(pw_501_4065x8)] 5083 vpbroadcastd m19, [o(pw_m1092_3948x8)] 5084 vpbroadcastd m16, [o(pw_1285_3889x8)] 5085 vpbroadcastd m15, [o(pw_m301_4085x8)] 5086 pmulhrsw m8, m22 ; t32a t63a 5087 pmulhrsw m21, m29 ; t35a t60a 5088 pmulhrsw m14, m26 ; t36a t59a 5089 pmulhrsw m17, m25 ; t39a t56 5090 pmulhrsw m18, m24 ; t40a t55a 5091 pmulhrsw m19, m27 ; t43a t52a 5092 pmulhrsw m16, m28 ; t44a t51a 5093 pmulhrsw m15, m23 ; t47a t48a 5094 mova m22, m8 5095 mova m29, m21 5096 mova m26, m14 5097 mova m25, m17 5098 mova m24, m18 5099 mova m27, m19 5100 mova m28, m16 5101 mova m20, m15 5102 jmp .main_oddhalf2 5103ALIGN function_align 5104.main_oddhalf: 5105 vpbroadcastd m8, [o(pw_101_4095x8)] 5106 vpbroadcastd m9, [o(pw_m2824_2967x8)] 5107 vpbroadcastd m11, [o(pw_1660_3745x8)] 5108 vpbroadcastd m12, [o(pw_m1474_3822x8)] 5109 pmulhrsw m22, m8 ; t32a t63a 5110 vpbroadcastd m8, [o(pw_897_3996x8)] 5111 pmulhrsw m21, m9 ; t33a t62a 5112 vpbroadcastd m9, [o(pw_m2191_3461x8)] 5113 pmulhrsw m14, m11 ; t34a t61a 5114 vpbroadcastd m11, [o(pw_2359_3349x8)] 5115 pmulhrsw m29, m12 ; t35a t60a 5116 vpbroadcastd m12, [o(pw_m700_4036x8)] 5117 pmulhrsw m26, m8 ; t36a t59a 5118 vpbroadcastd m8, [o(pw_501_4065x8)] 5119 pmulhrsw m17, m9 ; t37a t58a 5120 vpbroadcastd m9, [o(pw_m2520_3229x8)] 5121 pmulhrsw m18, m11 ; t38a t57a 5122 vpbroadcastd m11, [o(pw_2019_3564x8)] 5123 pmulhrsw m25, m12 ; t39a t56a 5124 vpbroadcastd m12, [o(pw_m1092_3948x8)] 5125 pmulhrsw m24, m8 ; t40a t55a 5126 vpbroadcastd m8, [o(pw_1285_3889x8)] 5127 pmulhrsw m19, m9 ; t41a t54a 5128 vpbroadcastd m9, [o(pw_m1842_3659x8)] 5129 pmulhrsw m16, m11 ; t42a t53a 5130 vpbroadcastd m11, [o(pw_2675_3102x8)] 5131 pmulhrsw m27, m12 ; t43a t52a 5132 vpbroadcastd m12, [o(pw_m301_4085x8)] 5133 pmulhrsw m28, m8 ; t44a t51a 5134 pmulhrsw m15, m9 ; t45a t50a 5135 pmulhrsw m20, m11 ; t46a t49a 5136 pmulhrsw m23, m12 ; t47a t48a 5137 psubsw m8, m22, m21 ; t33 t62 5138 paddsw m22, m21 ; t32 t63 5139 psubsw m21, m29, m14 ; t34 t61 5140 paddsw m29, m14 ; t35 t60 5141 psubsw m14, m26, m17 ; t37 t58 5142 paddsw m26, m17 ; t36 t59 5143 psubsw m17, m25, m18 ; t38 t57 5144 paddsw m25, m18 ; t39 t56 5145 psubsw m18, m24, m19 ; t41 t54 5146 paddsw m24, m19 ; t40 t55 5147 psubsw m19, m27, m16 ; t42 t53 5148 paddsw m27, m16 ; t43 t52 5149 psubsw m16, m28, m15 ; t45 t50 5150 paddsw m28, m15 ; t44 t51 5151 psubsw m15, m23, m20 ; t46 t49 5152 paddsw m20, m23 ; t47 t48 5153.main_oddhalf2: 5154 ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a 5155 ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a 5156 ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a 5157 ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a 5158 ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a 5159 ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a 5160 ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a 5161 ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a 5162 vpbroadcastd m11, [o(pw_m4017_799)] 5163 psubsw m23, m25, m26 ; t36a t59a 5164 paddsw m25, m26 ; t39a t56a 5165 psubsw m26, m24, m27 ; t43a t52a 5166 paddsw m27, m24 ; t40a t55a 5167 psubsw m24, m20, m28 ; t44a t51a 5168 paddsw m20, m28 ; t47a t48a 5169 psubsw m28, m8, m21 ; t34 t61 5170 paddsw m8, m21 ; t33 t62 5171 psubsw m21, m17, m14 ; t37 t58 5172 paddsw m17, m14 ; t38 t57 5173 psubsw m14, m18, m19 ; t42 t53 5174 paddsw m18, m19 ; t41 t54 5175 psubsw m19, m15, m16 ; t45 t50 5176 paddsw m15, m16 ; t46 t49 5177 psubsw m16, m22, m29 ; t35a t60a 5178 paddsw m22, m29 ; t32a t63a 5179 ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60 5180 ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a 5181 ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59 5182 ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a 5183 vpbroadcastd m11, [o(pw_m2276_3406)] 5184 ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52 5185 ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a 5186 ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51 5187 ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a 5188 vpbroadcastd m11, [o(pw_1567_3784)] 5189 vpbroadcastd m12, [o(pw_m3784_1567)] 5190 psubsw m29, m22, m25 ; t39 t56 5191 paddsw m22, m25 ; t32 t63 5192 psubsw m25, m20, m27 ; t40 t55 5193 paddsw m20, m27 ; t47 t48 5194 psubsw m27, m8, m17 ; t38a t57a 5195 paddsw m8, m17 ; t33a t62a 5196 psubsw m17, m15, m18 ; t41a t54a 5197 paddsw m15, m18 ; t46a t49a 5198 paddsw m18, m16, m23 ; t35a t60a 5199 psubsw m16, m23 ; t36a t59a 5200 psubsw m23, m24, m26 ; t43a t52a 5201 paddsw m24, m26 ; t44a t51a 5202 paddsw m26, m28, m21 ; t34 t61 5203 psubsw m28, m21 ; t37 t58 5204 psubsw m21, m19, m14 ; t42 t53 5205 paddsw m19, m14 ; t45 t50 5206 ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a 5207 ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57 5208 ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59 5209 ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a 5210 vpbroadcastd m11, [o(pw_m1567_m3784)] 5211 ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a 5212 ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54 5213 ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52 5214 ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a 5215 vbroadcasti32x4 m13, [o(deint_shuf)] 5216 vpbroadcastd m11, [o(pw_2896_2896)] 5217 vpbroadcastd m12, [o(pw_m2896_2896)] 5218 paddsw m14, m22, m20 ; t32a t63a 5219 psubsw m22, m20 ; t47a t48a 5220 psubsw m20, m8, m15 ; t46 t49 5221 paddsw m8, m15 ; t33 t62 5222 paddsw m15, m18, m24 ; t35 t60 5223 psubsw m18, m24 ; t44 t51 5224 psubsw m24, m26, m19 ; t45a t50a 5225 paddsw m26, m19 ; t34a t61a 5226 REPX {pshufb x, m13}, m14, m8, m15, m26 5227 psubsw m19, m29, m25 ; t40 t55 5228 paddsw m25, m29 ; t39 t56 5229 psubsw m29, m27, m17 ; t41a t54a 5230 paddsw m27, m17 ; t38a t57a 5231 psubsw m17, m16, m23 ; t43a t52a 5232 paddsw m16, m23 ; t36a t59a 5233 psubsw m9, m28, m21 ; t42 t53 5234 paddsw m28, m21 ; t37 t58 5235 REPX {pshufb x, m13}, m25, m27, m16, m28 5236 ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48 5237 ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a 5238 packssdw m21, m22 ; t47 t46a 5239 packssdw m13, m23 ; t48 t49a 5240 ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a 5241 ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 5242 packssdw m20, m18 ; t44a t45 5243 packssdw m22, m23 ; t51a t50 5244 ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a 5245 ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 5246 packssdw m18, m19 ; t40a t41 5247 packssdw m24, m23 ; t55a t54 5248 ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 5249 ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a 5250 packssdw m19, m17 ; t43 t42a 5251 packssdw m23, m29 ; t52 t53a 5252 punpcklqdq m17, m25, m27 ; t39 t38a 5253 punpckhqdq m25, m27 ; t56 t57a 5254 punpckhqdq m27, m15, m26 ; t60 t61a 5255 punpcklqdq m15, m26 ; t35 t34a 5256 punpckhqdq m26, m16, m28 ; t59a t58 5257 punpcklqdq m16, m28 ; t36a t37 5258 punpckhqdq m28, m14, m8 ; t63a t62 5259 punpcklqdq m14, m8 ; t32a t33 5260 psubsw m29, m0, m28 ; out63 out62 5261 paddsw m0, m28 ; out0 out1 5262 psubsw m28, m1, m27 ; out60 out61 5263 paddsw m1, m27 ; out3 out2 5264 psubsw m27, m2, m26 ; out59 out58 5265 paddsw m2, m26 ; out4 out5 5266 psubsw m26, m3, m25 ; out56 out57 5267 paddsw m3, m25 ; out7 out6 5268 psubsw m25, m4, m24 ; out55 out54 5269 paddsw m4, m24 ; out8 out9 5270 psubsw m24, m5, m23 ; out52 out53 5271 paddsw m5, m23 ; out11 out10 5272 psubsw m23, m6, m22 ; out51 out50 5273 paddsw m6, m22 ; out12 out13 5274 psubsw m22, m7, m13 ; out48 out49 5275 paddsw m7, m13 ; out15 out14 5276 ret 5277 5278cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob 5279%undef cmp 5280 lea r5, [o_base] 5281 test eobd, eobd 5282 jnz .normal 5283 movsx r6d, word [cq] 5284 mov [cq], eobd 5285 mov r3d, 16 5286.dconly: 5287 imul r6d, 181 5288 add r6d, 128+512 5289 sar r6d, 8+2 5290.dconly2: 5291 imul r6d, 181 5292 add r6d, 128+2048 5293 sar r6d, 8+4 5294 pxor m2, m2 5295 vpbroadcastw m3, r6d 5296.dconly_loop: 5297 mova m1, [dstq] 5298 punpcklbw m0, m1, m2 5299 punpckhbw m1, m2 5300 paddw m0, m3 5301 paddw m1, m3 5302 packuswb m0, m1 5303 mova [dstq], m0 5304 add dstq, strideq 5305 dec r3d 5306 jg .dconly_loop 5307 RET 5308.normal: 5309 WIN64_SPILL_XMM 31 5310 mova m19, [o(dup16_perm)] 5311 mova m24, [cq+64* 2] 5312 mova m28, [cq+64* 6] 5313 mova m26, [cq+64* 4] 5314 mova m22, [cq+64* 0] 5315 mova m23, [cq+64* 1] 5316 mova m29, [cq+64* 7] 5317 mova m27, [cq+64* 5] 5318 mova m25, [cq+64* 3] 5319 vpermb m8, m19, m24 ; 4 5320 vpermb m1, m19, m28 ; 12 5321 vpermb m7, m19, m26 ; 8 5322 vpermb m9, m19, m22 ; __ 0 5323 vpermb m21, m19, m23 ; 2 5324 vpermb m15, m19, m29 ; 14 5325 vpermb m18, m19, m27 ; 10 5326 vpermb m14, m19, m25 ; 6 5327 pslld m9, 16 5328 vpord m30, m19, [o(pb_32)] {1to16} 5329 REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 5330 cmp eobd, 151 5331 jb .fast 5332 vpermb m0, m19, [cq+64*14] ; 28 5333 vpermb m5, m19, [cq+64*10] ; 20 5334 vpermb m3, m19, [cq+64*12] ; 24 5335 vpermb m6, m19, [cq+64* 8] ; __ 16 5336 pslld m6, 16 5337 call m(idct_16x16_internal_8bpc).main_fast 5338 vpermb m17, m19, [cq+64*15] ; 30 5339 vpermb m20, m19, [cq+64* 9] ; 18 5340 vpermb m16, m19, [cq+64*11] ; 22 5341 vpermb m19, m19, [cq+64*13] ; 26 5342 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 5343 mova [cq+64* 0], m14 5344 mova [cq+64* 1], m15 5345 mova [cq+64* 2], m16 5346 mova [cq+64* 3], m17 5347 mova [cq+64* 4], m18 5348 mova [cq+64* 5], m19 5349 mova [cq+64* 6], m20 5350 mova [cq+64* 7], m21 5351 vpermb m21, m30, [cq+64*15] 5352 vpermb m14, m30, [cq+64* 8] 5353 vpermb m17, m30, [cq+64*11] 5354 vpermb m18, m30, [cq+64*12] 5355 vpermb m19, m30, [cq+64*13] 5356 vpermb m16, m30, [cq+64*10] 5357 vpermb m15, m30, [cq+64* 9] 5358 vpermb m20, m30, [cq+64*14] 5359 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf 5360 jmp .end 5361.fast: ; bottom half is zero 5362 call m(idct_16x16_internal_8bpc).main_fast2 5363 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 5364 mova [cq+64* 0], m14 5365 mova [cq+64* 1], m15 5366 mova [cq+64* 2], m16 5367 mova [cq+64* 3], m17 5368 mova [cq+64* 4], m18 5369 mova [cq+64* 5], m19 5370 mova [cq+64* 6], m20 5371 mova [cq+64* 7], m21 5372 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 5373.end: 5374 mova [cq+64* 8], m4 5375 mova [cq+64* 9], m5 5376 mova [cq+64*10], m6 5377 mova [cq+64*11], m7 5378 mova [cq+64*12], m26 5379 mova [cq+64*13], m27 5380 mova [cq+64*14], m28 5381 mova [cq+64*15], m29 5382 vpbroadcastd m13, [o(pw_8192)] 5383 call .pass1_end 5384 call .pass2 5385 mova [cq+64* 0], m0 5386 mova [cq+64* 1], m1 5387 mova [cq+64* 2], m2 5388 mova [cq+64* 3], m3 5389 mova [cq+64* 4], m4 5390 mova [cq+64* 5], m5 5391 mova [cq+64* 6], m6 5392 mova [cq+64* 7], m7 5393 pmulhrsw m0, m13, [cq+64* 8] 5394 pmulhrsw m1, m13, [cq+64* 9] 5395 pmulhrsw m2, m13, [cq+64*10] 5396 pmulhrsw m3, m13, [cq+64*11] 5397 vpbroadcastd m30, [o(pw_2048)] 5398 pmulhrsw m4, m13, m22 5399 pmulhrsw m5, m13, m23 5400 pmulhrsw m6, m13, m24 5401 pmulhrsw m7, m13, m25 5402 pmulhrsw m22, m30, m14 5403 pmulhrsw m14, m13, m26 5404 pmulhrsw m23, m30, m15 5405 pmulhrsw m15, m13, m27 5406 pmulhrsw m24, m30, m16 5407 pmulhrsw m16, m13, m28 5408 pmulhrsw m25, m30, m17 5409 pmulhrsw m17, m13, m29 5410 pmulhrsw m26, m30, m18 5411 pmulhrsw m18, m13, [cq+64*12] 5412 pmulhrsw m27, m30, m19 5413 pmulhrsw m19, m13, [cq+64*13] 5414 pmulhrsw m28, m30, m20 5415 pmulhrsw m20, m13, [cq+64*14] 5416 pmulhrsw m29, m30, m21 5417 pmulhrsw m21, m13, [cq+64*15] 5418 call .transpose_round 5419 call .pass2 5420 pxor m10, m10 5421 lea r3, [strideq*3] 5422%macro IDCT_64x16_END 4 5423 mova m9, [dstq+%4] 5424%if %1 < 8 5425 pmulhrsw m%3, m30, [cq+64*%1] 5426%endif 5427 pmulhrsw m%2, m30 5428 mova [cq+64*%1], m10 5429 punpcklbw m8, m9, m10 5430 punpckhbw m9, m10 5431 paddw m8, m%3 5432 paddw m9, m%2 5433 packuswb m8, m9 5434 mova [dstq+%4], m8 5435%if %1 == 3 || %1 == 7 || %1 == 11 5436 lea dstq, [dstq+strideq*4] 5437%endif 5438%endmacro 5439 IDCT_64x16_END 0, 0, 11, strideq*0 5440 IDCT_64x16_END 1, 1, 11, strideq*1 5441 IDCT_64x16_END 2, 2, 11, strideq*2 5442 IDCT_64x16_END 3, 3, 11, r3 5443 IDCT_64x16_END 4, 4, 11, strideq*0 5444 IDCT_64x16_END 5, 5, 11, strideq*1 5445 IDCT_64x16_END 6, 6, 11, strideq*2 5446 IDCT_64x16_END 7, 7, 11, r3 5447 IDCT_64x16_END 8, 14, 22, strideq*0 5448 IDCT_64x16_END 9, 15, 23, strideq*1 5449 IDCT_64x16_END 10, 16, 24, strideq*2 5450 IDCT_64x16_END 11, 17, 25, r3 5451 IDCT_64x16_END 12, 18, 26, strideq*0 5452 IDCT_64x16_END 13, 19, 27, strideq*1 5453 IDCT_64x16_END 14, 20, 28, strideq*2 5454 IDCT_64x16_END 15, 21, 29, r3 5455 RET 5456ALIGN function_align 5457.pass1_end: 5458 mova m4, [cq+64* 0] 5459 mova m5, [cq+64* 1] 5460 mova m6, [cq+64* 2] 5461 mova m7, [cq+64* 3] 5462 mova m8, [cq+64* 4] 5463 mova m9, [cq+64* 5] 5464 mova m11, [cq+64* 6] 5465 mova m12, [cq+64* 7] 5466 psubsw m29, m4, m21 ; out47 out46 5467 paddsw m4, m21 ; out16 out17 5468 psubsw m28, m5, m20 ; out44 out45 5469 paddsw m5, m20 ; out19 out18 5470 REPX {pmulhrsw x, m13}, m0, m1, m2, m3 5471 psubsw m27, m6, m19 ; out43 out42 5472 paddsw m6, m19 ; out20 out21 5473 psubsw m26, m7, m18 ; out40 out41 5474 paddsw m7, m18 ; out23 out22 5475 pmulhrsw m18, m13, m22 5476 pmulhrsw m19, m13, m23 5477 pmulhrsw m20, m13, m24 5478 pmulhrsw m21, m13, m25 5479 paddsw m25, m12, m14 ; out31 out30 5480 psubsw m14, m12, m14 ; out32 out33 5481 paddsw m24, m11, m15 ; out28 out29 5482 psubsw m15, m11, m15 ; out35 out34 5483 REPX {pmulhrsw x, m13}, m4, m5, m6, m7 5484 paddsw m23, m9, m16 ; out27 out26 5485 psubsw m16, m9, m16 ; out36 out37 5486 paddsw m22, m8, m17 ; out24 out25 5487 psubsw m17, m8, m17 ; out39 out38 5488 REPX {pmulhrsw x, m13}, m14, m15, m16, m17 5489.transpose_round: 5490%macro TRANSPOSE_8x4_PACKED 4 5491 punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3 5492 punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3 5493 punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3 5494 punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3 5495 punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3 5496 punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1 5497 punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3 5498 punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1 5499 punpcklwd m%3, m%4, m%2 ; 2 5500 punpckhwd m%4, m%2 ; 3 5501 punpckhwd m%2, m%1, m8 ; 1 5502 punpcklwd m%1, m8 ; 0 5503%endmacro 5504 TRANSPOSE_8x4_PACKED 0, 1, 2, 3 5505 TRANSPOSE_8x4_PACKED 18, 19, 20, 21 5506 TRANSPOSE_8x4_PACKED 4, 5, 6, 7 5507 TRANSPOSE_8x4_PACKED 14, 15, 16, 17 5508 vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03 5509 vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01 5510 vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13 5511 vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11 5512 vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23 5513 vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21 5514 vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33 5515 vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31 5516 vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03 5517 vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01 5518 vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13 5519 vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11 5520 vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23 5521 vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21 5522 vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33 5523 vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 5524 ret 5525.pass2: 5526 vshufi32x4 m7, m5, m19, q3131 ; 14 5527 vshufi32x4 m5, m19, q2020 ; 10 5528 vshufi32x4 m21, m6, m20, q3131 ; 15 5529 vshufi32x4 m19, m6, m20, q2020 ; 11 5530 vshufi32x4 m20, m4, m18, q3131 ; 13 5531 vshufi32x4 m18, m4, m18, q2020 ; 9 5532 vshufi32x4 m6, m8, m2, q3131 ; 12 5533 vshufi32x4 m4, m8, m2, q2020 ; 8 5534 vshufi32x4 m2, m0, m3, q3131 ; 4 5535 vshufi32x4 m0, m3, q2020 ; 0 5536 vshufi32x4 m3, m1, m16, q3131 ; 6 5537 vshufi32x4 m1, m16, q2020 ; 2 5538 vshufi32x4 m16, m9, m15, q3131 ; 5 5539 vshufi32x4 m14, m9, m15, q2020 ; 1 5540 vshufi32x4 m15, m11, m17, q2020 ; 3 5541 vshufi32x4 m17, m11, m17, q3131 ; 7 5542 call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 5543 jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 5544 5545cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob 5546 lea r5, [o_base] 5547 test eobd, eobd 5548 jz .dconly 5549 PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob 5550 vpbroadcastd m23, [o(pw_2896x8)] 5551%undef cmp 5552 cmp eobd, 136 5553 jb .fast 5554 pmulhrsw m5, m23, [cq+64*20] 5555 pmulhrsw m3, m23, [cq+64*12] 5556 pmulhrsw m1, m23, [cq+64* 4] 5557 pmulhrsw m7, m23, [cq+64*28] 5558 pmulhrsw m2, m23, [cq+64* 8] 5559 pmulhrsw m6, m23, [cq+64*24] 5560 pmulhrsw m0, m23, [cq+64* 0] 5561 pmulhrsw m4, m23, [cq+64*16] 5562 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 5563 pmulhrsw m14, m23, [cq+64* 2] 5564 pmulhrsw m21, m23, [cq+64*30] 5565 pmulhrsw m18, m23, [cq+64*18] 5566 pmulhrsw m17, m23, [cq+64*14] 5567 pmulhrsw m16, m23, [cq+64*10] 5568 pmulhrsw m19, m23, [cq+64*22] 5569 pmulhrsw m20, m23, [cq+64*26] 5570 pmulhrsw m15, m23, [cq+64* 6] 5571 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 5572 mova [cq+64* 0], m14 5573 mova [cq+64* 2], m15 5574 mova [cq+64* 4], m16 5575 mova [cq+64* 6], m17 5576 mova [cq+64* 8], m18 5577 mova [cq+64*10], m19 5578 mova [cq+64*12], m20 5579 mova [cq+64*14], m21 5580 pmulhrsw m22, m23, [cq+64* 1] 5581 pmulhrsw m21, m23, [cq+64*31] 5582 pmulhrsw m14, m23, [cq+64*17] 5583 pmulhrsw m29, m23, [cq+64*15] 5584 pmulhrsw m26, m23, [cq+64* 9] 5585 pmulhrsw m17, m23, [cq+64*23] 5586 pmulhrsw m18, m23, [cq+64*25] 5587 pmulhrsw m25, m23, [cq+64* 7] 5588 pmulhrsw m24, m23, [cq+64* 5] 5589 pmulhrsw m19, m23, [cq+64*27] 5590 pmulhrsw m16, m23, [cq+64*21] 5591 pmulhrsw m27, m23, [cq+64*11] 5592 pmulhrsw m28, m23, [cq+64*13] 5593 pmulhrsw m15, m23, [cq+64*19] 5594 pmulhrsw m20, m23, [cq+64*29] 5595 pmulhrsw m23, [cq+64* 3] 5596 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 5597 vpbroadcastd m12, [o(pw_16384)] 5598 psubsw m13, m0, m29 ; 31 5599 paddsw m0, m29 ; 0 5600 psubsw m29, m1, m28 ; 30 5601 paddsw m1, m28 ; 1 5602 psubsw m28, m2, m27 ; 29 5603 paddsw m2, m27 ; 2 5604 psubsw m27, m3, m26 ; 28 5605 paddsw m3, m26 ; 3 5606 psubsw m26, m4, m25 ; 27 5607 paddsw m4, m25 ; 4 5608 psubsw m25, m5, m24 ; 26 5609 paddsw m5, m24 ; 5 5610 psubsw m24, m6, m23 ; 25 5611 paddsw m6, m23 ; 6 5612 psubsw m23, m7, m22 ; 24 5613 paddsw m7, m22 ; 7 5614 pxor m9, m9 5615 punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 5616 punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 5617 punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 5618 punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 5619 REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 5620 punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 5621 punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 5622 punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 5623 punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 5624 REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 5625 punpckhwd m3, m23, m24 5626 punpcklwd m23, m24 5627 punpckhwd m24, m25, m26 5628 punpcklwd m25, m26 5629 REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 5630 punpckhwd m26, m27, m28 5631 punpcklwd m27, m28 5632 punpckhwd m28, m29, m13 5633 punpcklwd m29, m13 5634 REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 5635 punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 5636 punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 5637 punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 5638 punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 5639 REPX {pmulhrsw x, m12}, m7, m0, m2, m4 5640 punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 5641 punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 5642 punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 5643 punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 5644 REPX {pmulhrsw x, m12}, m6, m8, m1, m22 5645 punpckhdq m13, m23, m25 5646 punpckldq m23, m25 5647 punpckhdq m25, m27, m29 5648 punpckldq m27, m29 5649 REPX {pmulhrsw x, m12}, m13, m23, m25, m27 5650 punpckhdq m9, m3, m24 5651 punpckldq m3, m24 5652 punpckhdq m24, m26, m28 5653 punpckldq m26, m28 5654 REPX {pmulhrsw x, m12}, m9, m3, m24, m26 5655 punpckhqdq m5, m23, m27 ; d01 d09 d17 d25 5656 punpcklqdq m23, m27 ; d00 d08 d16 d24 5657 punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 5658 punpckhqdq m13, m25 ; d03 d11 d19 d27 5659 punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 5660 punpckhqdq m3, m26 ; d05 d13 d21 d29 5661 punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 5662 punpckhqdq m9, m24 ; d07 d15 d23 d31 5663 mova [cq+64* 3], m23 5664 mova [cq+64*13], m27 5665 mova [cq+64* 7], m25 5666 mova [cq+64*15], m26 5667 punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 5668 punpcklqdq m8, m22 ; a04 a12 a20 a28 5669 punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 5670 punpcklqdq m0, m4 ; a00 a08 a16 a24 5671 punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 5672 punpcklqdq m7, m2 ; a02 a10 a18 a26 5673 punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 5674 punpcklqdq m6, m1 ; a06 a14 a22 a30 5675 mova [cq+64* 1], m0 5676 mova [cq+64* 9], m7 5677 mova [cq+64* 5], m8 5678 mova [cq+64*11], m6 5679 mova m2, [cq+64* 0] 5680 mova m11, [cq+64* 2] 5681 mova m8, [cq+64* 4] 5682 mova m29, [cq+64* 6] 5683 mova m27, [cq+64* 8] 5684 mova m26, [cq+64*10] 5685 mova m4, [cq+64*12] 5686 mova m28, [cq+64*14] 5687 psubsw m1, m2, m21 ; 23 5688 paddsw m2, m21 ; 8 5689 psubsw m21, m11, m20 ; 22 5690 paddsw m11, m20 ; 9 5691 psubsw m20, m8, m19 ; 21 5692 paddsw m8, m19 ; 10 5693 psubsw m19, m29, m18 ; 20 5694 paddsw m29, m18 ; 11 5695 psubsw m18, m27, m17 ; 19 5696 paddsw m27, m17 ; 12 5697 psubsw m17, m26, m16 ; 18 5698 paddsw m26, m16 ; 13 5699 psubsw m16, m4, m15 ; 17 5700 paddsw m4, m15 ; 14 5701 psubsw m15, m28, m14 ; 16 5702 paddsw m28, m14 ; 15 5703 punpcklwd m14, m15, m16 5704 punpckhwd m15, m16 5705 punpckhwd m16, m17, m18 5706 punpcklwd m17, m18 5707 punpckhwd m18, m19, m20 5708 punpcklwd m19, m20 5709 punpckhwd m20, m21, m1 5710 punpcklwd m21, m1 5711 punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 5712 punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 5713 punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 5714 punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 5715 punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 5716 punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 5717 punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 5718 punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 5719 punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 5720 punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 5721 punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 5722 punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 5723 REPX {pmulhrsw x, m12}, m28, m2, m8, m27 5724 punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 5725 punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 5726 punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 5727 punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 5728 REPX {pmulhrsw x, m12}, m4, m1, m11, m29 5729 punpckhdq m26, m19, m21 5730 punpckldq m19, m21 5731 punpckhdq m21, m15, m16 5732 punpckldq m15, m16 5733 REPX {pmulhrsw x, m12}, m26, m19, m21, m15 5734 punpckhdq m16, m18, m20 5735 punpckldq m18, m20 5736 punpckhdq m20, m14, m17 5737 punpckldq m14, m17 5738 REPX {pmulhrsw x, m12}, m16, m18, m20, m14 5739 punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 5740 punpcklqdq m28, m8 ; b02 b10 b18 b26 5741 punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 5742 punpcklqdq m2, m27 ; b00 b08 b16 b24 5743 punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 5744 punpckhqdq m1, m29 ; b05 b13 b21 b29 5745 punpcklqdq m29, m4, m11 ; b06 b14 b22 b30 5746 punpckhqdq m4, m11 ; b07 b15 b23 b31 5747 mova [cq+64* 0], m2 5748 mova [cq+64* 8], m28 5749 mova [cq+64* 4], m27 5750 mova [cq+64*10], m29 5751 punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 5752 punpcklqdq m20, m26 ; c02 c10 c18 c26 5753 punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 5754 punpcklqdq m14, m19 ; c00 c08 c16 c24 5755 punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 5756 punpcklqdq m15, m18 ; c04 c12 c20 c28 5757 punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 5758 punpcklqdq m21, m16 ; c06 c14 c22 c30 5759 mova [cq+64* 2], m14 5760 mova [cq+64*12], m20 5761 mova [cq+64* 6], m15 5762 mova [cq+64*14], m21 5763 vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 5764 vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09 5765 vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 5766 vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 5767 vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 5768 vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 5769 vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 5770 vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 5771 vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 5772 vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 5773 vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 5774 vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 5775 vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 5776 vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 5777 vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 5778 vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 5779 mov r4, rsp 5780 vshufi32x4 m0, m22, m19, q2020 ; 1 5781 vshufi32x4 m1, m17, m29, q3131 ; 31 5782 vshufi32x4 m2, m14, m26, q2020 ; 17 5783 vshufi32x4 m3, m25, m18, q3131 ; 15 5784 call .main_part1 5785 vshufi32x4 m0, m25, m18, q2020 ; 7 5786 vshufi32x4 m1, m14, m26, q3131 ; 25 5787 vshufi32x4 m2, m17, m29, q2020 ; 23 5788 vshufi32x4 m3, m22, m19, q3131 ; 9 5789 call .main_part1 5790 vshufi32x4 m0, m24, m21, q2020 ; 5 5791 vshufi32x4 m1, m15, m27, q3131 ; 27 5792 vshufi32x4 m2, m16, m28, q2020 ; 21 5793 vshufi32x4 m3, m23, m20, q3131 ; 11 5794 call .main_part1 5795 vshufi32x4 m0, m23, m20, q2020 ; 3 5796 vshufi32x4 m1, m16, m28, q3131 ; 29 5797 vshufi32x4 m2, m15, m27, q2020 ; 19 5798 vshufi32x4 m3, m24, m21, q3131 ; 13 5799 call .main_part1 5800 call .main_part2 5801 mova m0, [cq+64* 1] ; a0 5802 mova m15, [cq+64* 0] ; b0 5803 mova m3, [cq+64* 2] ; c0 5804 mova m16, [cq+64* 3] ; d0 5805 mova m14, [cq+64* 5] ; a4 5806 mova m8, [cq+64* 4] ; b4 5807 mova m17, [cq+64* 6] ; c4 5808 mova m1, [cq+64* 7] ; d4 5809 vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 5810 vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 5811 vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 5812 vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 5813 vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 5814 vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 5815 vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 5816 vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 5817 vshufi32x4 m1, m0, m3, q3131 ; 8 5818 vshufi32x4 m0, m3, q2020 ; 0 5819 vshufi32x4 m3, m2, m15, q3131 ; 24 5820 vshufi32x4 m2, m15, q2020 ; 16 5821 vshufi32x4 m15, m14, m17, q3131 ; 12 5822 vshufi32x4 m14, m17, q2020 ; 4 5823 vshufi32x4 m17, m16, m8, q3131 ; 28 5824 vshufi32x4 m16, m8, q2020 ; 20 5825 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 5826 mova m8, [cq+64* 8] 5827 mova m9, [cq+64*12] 5828 mova m11, [cq+64*10] 5829 mova m12, [cq+64*14] 5830 mova [cq+64* 0], m14 5831 mova [cq+64* 2], m15 5832 mova [cq+64* 4], m16 5833 mova [cq+64* 6], m17 5834 mova [cq+64* 8], m18 5835 mova [cq+64*10], m19 5836 mova [cq+64*12], m20 5837 mova [cq+64*14], m21 5838 mova m22, [cq+64* 9] 5839 mova m27, [cq+64*13] 5840 mova m23, [cq+64*11] 5841 mova m24, [cq+64*15] 5842 vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 5843 vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 5844 vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 5845 vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10 5846 vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 5847 vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 5848 vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 5849 vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 5850 vshufi32x4 m28, m26, m8, q3131 ; 26 5851 vshufi32x4 m26, m8, q2020 ; 18 5852 vshufi32x4 m24, m22, m9, q3131 ; 10 5853 vshufi32x4 m22, m9, q2020 ; 2 5854 vshufi32x4 m29, m27, m11, q3131 ; 30 5855 vshufi32x4 m27, m11, q2020 ; 22 5856 vshufi32x4 m25, m23, m12, q3131 ; 14 5857 vshufi32x4 m23, m12, q2020 ; 6 5858 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 5859 jmp .end 5860.fast: ; bottom/right halves are zero 5861 pmulhrsw ym9, ym23, [cq+64* 0] 5862 pmulhrsw ym6, ym23, [cq+64* 8] 5863 mova m14, [o(dup16_perm)] 5864 pmulhrsw ym8, ym23, [cq+64* 2] 5865 pmulhrsw xm0, xm23, [cq+64*14] 5866 pmulhrsw xm5, xm23, [cq+64*10] 5867 pmulhrsw ym1, ym23, [cq+64* 6] 5868 pmulhrsw ym7, ym23, [cq+64* 4] 5869 pmulhrsw xm3, xm23, [cq+64*12] 5870 pmovzxwd m9, ym9 5871 pmovzxwd m6, ym6 5872 vpermb m8, m14, m8 5873 punpcklwd xm0, xm0 5874 vpermb ym5, ym14, ym5 5875 vpermb m1, m14, m1 5876 vpermb m7, m14, m7 5877 punpcklwd xm3, xm3 5878 pslld m9, 16 5879 pslld m6, 16 5880 call m(idct_16x16_internal_8bpc).main_fast 5881 vpmulhrsw ym21, ym23, [cq+64* 1] 5882 {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which 5883 {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to 5884 {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements 5885 {evex}vpmulhrsw ym18, ym23, [cq+64* 5] 5886 {evex}vpmulhrsw xm16, xm23, [cq+64*11] 5887 {evex}vpmulhrsw xm19, xm23, [cq+64*13] 5888 {evex}vpmulhrsw ym23, [cq+64* 3] 5889 vpermb m21, m14, m21 5890 punpcklwd xm17, xm17 5891 vpermb ym20, ym14, ym20 5892 vpermb m15, m14, m15 5893 vpermb m18, m14, m18 5894 vpermb ym16, ym14, ym16 5895 punpcklwd xm19, xm19 5896 vpermb m14, m14, m23 5897 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 5898 vpbroadcastd m9, [o(pw_16384)] 5899 call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round 5900 vshufi32x4 m16, m0, m3, q2020 ; 0 5901 vshufi32x4 m26, m0, m3, q3131 ; 4 5902 vshufi32x4 m0, m14, m2, q2020 ; 1 5903 vshufi32x4 m14, m2, q3131 ; 5 5904 vshufi32x4 m3, m19, m7, q3131 ; 15 5905 vshufi32x4 m19, m7, q2020 ; 11 5906 vshufi32x4 m27, m17, m9, q2020 ; 3 5907 vshufi32x4 m17, m9, q3131 ; 7 5908 vshufi32x4 m28, m20, m6, q2020 ; 9 5909 vshufi32x4 m20, m6, q3131 ; 13 5910 vshufi32x4 m22, m1, m18, q2020 ; 2 5911 vshufi32x4 m23, m1, m18, q3131 ; 6 5912 vshufi32x4 m24, m5, m15, q2020 ; 10 5913 vshufi32x4 m25, m5, m15, q3131 ; 14 5914 vshufi32x4 m15, m21, m4, q3131 ; 12 5915 vshufi32x4 m21, m21, m4, q2020 ; 8 5916 mov r4, rsp 5917 call .main_part1_fast 5918 mova m0, m17 5919 mova m3, m28 5920 call .main_part1_fast 5921 mova m0, m14 5922 mova m3, m19 5923 call .main_part1_fast 5924 mova m0, m27 5925 mova m3, m20 5926 call .main_part1_fast 5927 call .main_part2 5928 mova m0, m16 5929 mova m1, m21 5930 mova m14, m26 5931 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 5932 mova [cq+64*14], m21 5933 mova [cq+64* 0], m14 5934 mova [cq+64* 6], m17 5935 mova [cq+64* 8], m18 5936 mova [cq+64*10], m19 5937 mova [cq+64* 4], m16 5938 mova [cq+64* 2], m15 5939 mova [cq+64*12], m20 5940 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 5941.end: 5942 lea r4, [strideq*3] 5943 vpbroadcastd m12, [o(pw_2048)] 5944 movshdup m13, [o(permD)] 5945 lea r5, [r4+strideq] ; stride*4 5946 lea r3, [dstq+r4*8] 5947 lea r6, [strideq+r5*8] ; stride*33 5948 lea r8, [r4+r5*8] ; stride*35 5949 add r3, r5 ; dst+stride*28 5950 lea r7, [r6+strideq] ; stride*34 5951%macro IDCT_32x64_END 6 ; src, mem, stride[1-4] 5952%if %2 < 8 5953 paddsw m10, m%2, m%1 5954 psubsw m11, m%2, m%1 5955%else 5956 mova m11, [cq+64*(%2*2-16)] 5957 paddsw m10, m11, m%1 5958 psubsw m11, m%1 5959%endif 5960 mova m9, [rsp+64*(31-%2)] 5961 mova m%1, [rsp+64*%2] 5962 paddsw m8, m10, m9 5963 psubsw m10, m9 5964 paddsw m9, m11, m%1 5965 pmovzxbw m0, [dstq+%3] 5966 psubsw m11, m%1 5967 pmovzxbw m%1, [r3 +%4] 5968 REPX {pmulhrsw x, m12}, m8, m10, m9, m11 5969 paddw m8, m0 5970 pmovzxbw m0, [r3 +%5] 5971 paddw m10, m%1 5972 pmovzxbw m%1, [dstq+%6] 5973 paddw m9, m0 5974 paddw m11, m%1 5975%if %2 >= 8 5976%if %2 == 8 5977 pxor m1, m1 5978%endif 5979 mova [cq+64*(%2*2-16)], m1 5980 mova [cq+64*(%2*2-15)], m1 5981%endif 5982 packuswb m8, m10 5983 packuswb m9, m11 5984 vpermq m8, m13, m8 5985 vpermq m9, m13, m9 5986 mova [dstq+%3], ym8 5987 vextracti32x8 [r3 +%4], m8, 1 5988 mova [r3 +%5], ym9 5989 vextracti32x8 [dstq+%6], m9, 1 5990%if %2 == 3 || %2 == 7 || %2 == 11 5991 add dstq, r5 5992 sub r3, r5 5993%endif 5994%endmacro 5995 IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 5996 IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 5997 IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7 5998 IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 5999 IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 6000 IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 6001 IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 6002 IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 6003 IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 6004 IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 6005 IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 6006 IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 6007 IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 6008 IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 6009 IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 6010 IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 6011 RET 6012.dconly: 6013 movsx r6d, word [cq] 6014 mov [cq], eobd 6015 imul r6d, 181 6016 mov r3d, 64 6017 add r6d, 128 6018 sar r6d, 8 6019 imul r6d, 181 6020 add r6d, 128+256 6021 sar r6d, 8+1 6022 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 6023ALIGN function_align ; bottom three-quarters are zero 6024.main_part1_fast: 6025 vpbroadcastd m1, [o(idct64_mul+4*0)] 6026 vpbroadcastd m8, [o(idct64_mul+4*1)] 6027 vpbroadcastd m2, [o(idct64_mul+4*6)] 6028 vpbroadcastd m9, [o(idct64_mul+4*7)] 6029 pmulhrsw m1, m0 ; t63a 6030 pmulhrsw m0, m8 ; t32a 6031 pmulhrsw m2, m3 ; t60a 6032 pmulhrsw m3, m9 ; t35a 6033 mova m8, m0 6034 mova m7, m1 6035 mova m6, m3 6036 mova m5, m2 6037 jmp .main_part1b 6038.main_part1: 6039 ; idct64 steps 1-5: 6040 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 6041 ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 6042 ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 6043 ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 6044 vpbroadcastd m7, [o(idct64_mul+4*0)] 6045 vpbroadcastd m8, [o(idct64_mul+4*1)] 6046 vpbroadcastd m6, [o(idct64_mul+4*2)] 6047 vpbroadcastd m9, [o(idct64_mul+4*3)] 6048 pmulhrsw m7, m0 ; t63a 6049 vpbroadcastd m5, [o(idct64_mul+4*4)] 6050 pmulhrsw m0, m8 ; t32a 6051 vpbroadcastd m8, [o(idct64_mul+4*5)] 6052 pmulhrsw m6, m1 ; t62a 6053 vpbroadcastd m4, [o(idct64_mul+4*6)] 6054 pmulhrsw m1, m9 ; t33a 6055 vpbroadcastd m9, [o(idct64_mul+4*7)] 6056 pmulhrsw m5, m2 ; t61a 6057 pmulhrsw m2, m8 ; t34a 6058 pmulhrsw m4, m3 ; t60a 6059 pmulhrsw m3, m9 ; t35a 6060 psubsw m8, m0, m1 ; t33 6061 paddsw m0, m1 ; t32 6062 psubsw m1, m7, m6 ; t62 6063 paddsw m7, m6 ; t63 6064 psubsw m6, m3, m2 ; t34 6065 paddsw m3, m2 ; t35 6066 psubsw m2, m4, m5 ; t61 6067 paddsw m5, m4 ; t60 6068.main_part1b: 6069 vpbroadcastd m11, [o(idct64_mul+4*8)] 6070 vpbroadcastd m12, [o(idct64_mul+4*9)] 6071 ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a 6072 vpbroadcastd m11, [o(idct64_mul+4*10)] 6073 ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a 6074 vpbroadcastd m11, [o(idct64_mul+4*11)] 6075 vpbroadcastd m12, [o(idct64_mul+4*12)] 6076 psubsw m4, m0, m3 ; t35a 6077 paddsw m0, m3 ; t32a 6078 psubsw m3, m7, m5 ; t60a 6079 paddsw m7, m5 ; t63a 6080 psubsw m5, m1, m2 ; t34 6081 paddsw m1, m2 ; t33 6082 psubsw m2, m8, m6 ; t61 6083 paddsw m6, m8 ; t62 6084 add r5, 4*13 6085 ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 6086 ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a 6087 mova [r4+64*0], m0 6088 mova [r4+64*7], m7 6089 mova [r4+64*1], m1 6090 mova [r4+64*6], m6 6091 mova [r4+64*3], m3 6092 mova [r4+64*4], m4 6093 mova [r4+64*2], m2 6094 mova [r4+64*5], m5 6095 add r4, 64*8 6096 ret 6097.main_part2: 6098 vpbroadcastd m11, [o(pw_1567_3784 -16*13)] 6099 vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] 6100 lea r6, [r4+64*7] 6101 vpbroadcastd m17, [o(pw_m1567_m3784-16*13)] 6102 vpbroadcastd m18, [o(pw_2896_2896 -16*13)] 6103 vpbroadcastd m19, [o(pw_m2896_2896 -16*13)] 6104 sub r5, 16*13 6105.main_part2_loop: 6106 mova m0, [r4-64*32] ; t32a 6107 mova m1, [r6-64*24] ; t39a 6108 mova m2, [r6-64*32] ; t63a 6109 mova m3, [r4-64*24] ; t56a 6110 mova m4, [r4-64*16] ; t40a 6111 mova m5, [r6-64* 8] ; t47a 6112 mova m6, [r6-64*16] ; t55a 6113 mova m7, [r4-64* 8] ; t48a 6114 psubsw m8, m0, m1 ; t39 6115 paddsw m0, m1 ; t32 6116 psubsw m1, m2, m3 ; t56 6117 paddsw m2, m3 ; t63 6118 psubsw m3, m5, m4 ; t40 6119 paddsw m5, m4 ; t47 6120 psubsw m4, m7, m6 ; t55 6121 paddsw m7, m6 ; t48 6122 ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a 6123 ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a 6124 psubsw m6, m2, m7 ; t48a 6125 paddsw m2, m7 ; t63a 6126 psubsw m7, m0, m5 ; t47a 6127 paddsw m0, m5 ; t32a 6128 psubsw m5, m8, m3 ; t55 6129 paddsw m8, m3 ; t56 6130 psubsw m3, m1, m4 ; t40 6131 paddsw m1, m4 ; t39 6132 ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48 6133 ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a 6134 mova [r6-64* 8], m2 6135 mova [r4-64*32], m0 6136 mova [r4-64* 8], m8 6137 mova [r6-64*32], m1 6138 mova [r6-64*24], m6 6139 mova [r4-64*16], m7 6140 mova [r4-64*24], m5 6141 mova [r6-64*16], m3 6142 add r4, 64 6143 sub r6, 64 6144 cmp r4, r6 6145 jb .main_part2_loop 6146 ret 6147 6148cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob 6149 lea r5, [o_base] 6150 test eobd, eobd 6151 jz .dconly 6152 PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob 6153 vpbroadcastd m23, [o(pw_2896x8)] 6154%undef cmp 6155 cmp eobd, 136 6156 jb .fast 6157 pmulhrsw m0, m23, [cq+64* 1] 6158 pmulhrsw m1, m23, [cq+64*31] 6159 pmulhrsw m2, m23, [cq+64*17] 6160 pmulhrsw m3, m23, [cq+64*15] 6161 vpbroadcastd m10, [o(pd_2048)] 6162 mov r4, rsp 6163 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6164 pmulhrsw m0, m23, [cq+64* 7] 6165 pmulhrsw m1, m23, [cq+64*25] 6166 pmulhrsw m2, m23, [cq+64*23] 6167 pmulhrsw m3, m23, [cq+64* 9] 6168 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6169 pmulhrsw m0, m23, [cq+64* 5] 6170 pmulhrsw m1, m23, [cq+64*27] 6171 pmulhrsw m2, m23, [cq+64*21] 6172 pmulhrsw m3, m23, [cq+64*11] 6173 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6174 pmulhrsw m0, m23, [cq+64* 3] 6175 pmulhrsw m1, m23, [cq+64*29] 6176 pmulhrsw m2, m23, [cq+64*19] 6177 pmulhrsw m3, m23, [cq+64*13] 6178 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6179 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 6180 pmulhrsw m3, m23, [cq+64*24] 6181 pmulhrsw m1, m23, [cq+64* 8] 6182 pmulhrsw m2, m23, [cq+64*16] 6183 pmulhrsw m0, m23, [cq+64* 0] 6184 pmulhrsw m14, m23, [cq+64* 4] 6185 pmulhrsw m17, m23, [cq+64*28] 6186 pmulhrsw m16, m23, [cq+64*20] 6187 pmulhrsw m15, m23, [cq+64*12] 6188 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 6189 pmulhrsw m22, m23, [cq+64* 2] 6190 pmulhrsw m29, m23, [cq+64*30] 6191 pmulhrsw m26, m23, [cq+64*18] 6192 pmulhrsw m25, m23, [cq+64*14] 6193 pmulhrsw m24, m23, [cq+64*10] 6194 pmulhrsw m27, m23, [cq+64*22] 6195 pmulhrsw m28, m23, [cq+64*26] 6196 pmulhrsw m23, [cq+64* 6] 6197 mova [cq+64* 0], m14 6198 mova [cq+64* 1], m15 6199 mova [cq+64* 2], m16 6200 mova [cq+64* 3], m17 6201 mova [cq+64* 4], m18 6202 mova [cq+64* 5], m19 6203 mova [cq+64* 6], m20 6204 mova [cq+64* 7], m21 6205 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6206 vpbroadcastd m13, [o(pw_16384)] 6207 call .pass1_end_part1 6208 mova [cq+64*16], m1 6209 mova [cq+64*17], m3 6210 mova [cq+64*18], m5 6211 mova [cq+64*19], m7 6212 mova [cq+64*24], m23 6213 mova [cq+64*25], m25 6214 mova [cq+64*26], m27 6215 mova [cq+64*27], m29 6216 pmulhrsw m23, m13, m0 ; a0 6217 pmulhrsw m25, m13, m2 ; a2 6218 pmulhrsw m27, m13, m4 ; a4 6219 pmulhrsw m29, m13, m6 ; a6 6220 REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 6221 call .pass1_end_part2 6222 mova [cq+64*20], m15 6223 mova [cq+64*21], m17 6224 mova [cq+64*22], m19 6225 mova [cq+64*23], m21 6226 mova [cq+64*28], m1 6227 mova [cq+64*29], m3 6228 mova [cq+64*30], m5 6229 mova [cq+64*31], m7 6230 REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 6231 REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 6232 vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 6233 vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 6234 vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 6235 vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 6236 vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 6237 vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 6238 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 6239 vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 6240 vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 6241 vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 6242 vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 6243 vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 6244 vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 6245 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 6246 vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 6247 vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 6248 vshufi32x4 m2, m3, m15, q3131 ; 8 6249 vshufi32x4 m0, m3, m15, q2020 ; 0 6250 vshufi32x4 m6, m23, m22, q3131 ; 24 6251 vshufi32x4 m4, m23, m22, q2020 ; 16 6252 vshufi32x4 m3, m1, m18, q3131 ; 12 6253 vshufi32x4 m1, m18, q2020 ; 4 6254 vshufi32x4 m7, m27, m26, q3131 ; 28 6255 vshufi32x4 m5, m27, m26, q2020 ; 20 6256 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 6257 vshufi32x4 m16, m14, m17, q3131 ; 10 6258 vshufi32x4 m14, m17, q2020 ; 2 6259 vshufi32x4 m17, m19, m20, q3131 ; 14 6260 vshufi32x4 m15, m19, m20, q2020 ; 6 6261 vshufi32x4 m20, m25, m24, q3131 ; 26 6262 vshufi32x4 m18, m25, m24, q2020 ; 18 6263 vshufi32x4 m21, m29, m28, q3131 ; 30 6264 vshufi32x4 m19, m29, m28, q2020 ; 22 6265 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 6266 pmulhrsw m22, m13, [cq+64*16] ; a1 6267 pmulhrsw m23, m13, [cq+64*20] ; c1 6268 pmulhrsw m24, m13, [cq+64*24] ; e1 6269 pmulhrsw m25, m13, [cq+64*28] ; g1 6270 pmulhrsw m26, m13, [cq+64*17] ; a3 6271 pmulhrsw m27, m13, [cq+64*21] ; c3 6272 pmulhrsw m28, m13, [cq+64*25] ; e3 6273 pmulhrsw m29, m13, [cq+64*29] ; g3 6274 mova [cq+64* 8], m14 6275 mova [cq+64* 9], m15 6276 mova [cq+64*10], m16 6277 mova [cq+64*11], m17 6278 mova [cq+64*12], m18 6279 mova [cq+64*13], m19 6280 mova [cq+64*14], m20 6281 mova [cq+64*15], m21 6282 pmulhrsw m14, m13, [cq+64*18] ; a5 6283 pmulhrsw m15, m13, [cq+64*22] ; c5 6284 pmulhrsw m16, m13, [cq+64*26] ; e5 6285 pmulhrsw m17, m13, [cq+64*30] ; g5 6286 pmulhrsw m18, m13, [cq+64*19] ; a7 6287 pmulhrsw m19, m13, [cq+64*23] ; c7 6288 pmulhrsw m20, m13, [cq+64*27] ; e7 6289 pmulhrsw m21, m13, [cq+64*31] ; g7 6290 vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 6291 vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 6292 vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 6293 vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 6294 vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 6295 vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 6296 vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 6297 vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 6298 mova [cq+64* 0], m0 6299 mova [cq+64* 1], m1 6300 mova [cq+64* 2], m2 6301 mova [cq+64* 3], m3 6302 mova [cq+64* 4], m4 6303 mova [cq+64* 5], m5 6304 mova [cq+64* 6], m6 6305 mova [cq+64* 7], m7 6306 vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 6307 vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 6308 vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 6309 vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 6310 vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 6311 vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 6312 vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 6313 vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 6314 vshufi32x4 m27, m23, m11, q3131 ; 11 m27 6315 vshufi32x4 m23, m11, q2020 ; 3 m23 6316 vshufi32x4 m19, m26, m28, q3131 ; 27 m19 6317 vshufi32x4 m15, m26, m28, q2020 ; 19 m15 6318 vshufi32x4 m29, m25, m17, q3131 ; 15 m29 6319 vshufi32x4 m25, m17, q2020 ; 7 m25 6320 vshufi32x4 m21, m18, m20, q3131 ; 31 m21 6321 vshufi32x4 m17, m18, m20, q2020 ; 23 m17 6322 vshufi32x4 m20, m14, m16, q3131 ; 29 m20 6323 vshufi32x4 m16, m14, m16, q2020 ; 21 m16 6324 vshufi32x4 m18, m22, m24, q3131 ; 25 m18 6325 vshufi32x4 m14, m22, m24, q2020 ; 17 m14 6326 vshufi32x4 m26, m8, m9, q3131 ; 9 m26 6327 vshufi32x4 m22, m8, m9, q2020 ; 1 m22 6328 vshufi32x4 m28, m12, m13, q3131 ; 13 m28 6329 vshufi32x4 m24, m12, m13, q2020 ; 5 m24 6330 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 6331 vpbroadcastd m13, [o(pw_16384)] 6332 pmulhrsw m0, m13, [r4-64*21] 6333 pmulhrsw m1, m13, [r4-64*22] 6334 pmulhrsw m2, m13, [r4-64*23] 6335 pmulhrsw m3, m13, [r4-64*24] 6336 pmulhrsw m4, m13, [r4-64*25] 6337 pmulhrsw m5, m13, [r4-64*26] 6338 pmulhrsw m6, m13, [r4-64*27] 6339 pmulhrsw m7, m13, [r4-64*28] 6340 mova [cq+64*16], m14 6341 mova [cq+64*17], m15 6342 mova [cq+64*18], m16 6343 mova [cq+64*19], m17 6344 mova [cq+64*20], m18 6345 mova [cq+64*21], m19 6346 mova [cq+64*22], m20 6347 mova [cq+64*23], m21 6348 pmulhrsw m14, m13, [r4-64*12] 6349 pmulhrsw m15, m13, [r4-64*11] 6350 pmulhrsw m16, m13, [r4-64*10] 6351 pmulhrsw m17, m13, [r4-64* 9] 6352 pmulhrsw m18, m13, [r4-64* 8] 6353 pmulhrsw m19, m13, [r4-64* 7] 6354 pmulhrsw m20, m13, [r4-64* 6] 6355 pmulhrsw m21, m13, [r4-64* 5] 6356 mova [cq+64*24], m22 6357 mova [cq+64*25], m23 6358 mova [cq+64*26], m24 6359 mova [cq+64*27], m25 6360 mova [cq+64*28], m26 6361 mova [cq+64*29], m27 6362 mova [cq+64*30], m28 6363 mova [cq+64*31], m29 6364 call .transpose_2x8x8_lo 6365 mova [r4-64*12], m1 6366 mova [r4-64*11], m3 6367 mova [r4-64*10], m5 6368 mova [r4-64* 9], m7 6369 mova [r4-64* 8], m15 6370 mova [r4-64* 7], m17 6371 mova [r4-64* 6], m19 6372 mova [r4-64* 5], m21 6373 vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 6374 vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 6375 vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 6376 vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 6377 vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 6378 vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 6379 vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 6380 vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 6381 pmulhrsw m0, m13, [r4-64*20] 6382 pmulhrsw m1, m13, [r4-64*19] 6383 pmulhrsw m2, m13, [r4-64*18] 6384 pmulhrsw m3, m13, [r4-64*17] 6385 pmulhrsw m4, m13, [r4-64*16] 6386 pmulhrsw m5, m13, [r4-64*15] 6387 pmulhrsw m6, m13, [r4-64*14] 6388 pmulhrsw m7, m13, [r4-64*13] 6389 pmulhrsw m14, m13, [r4-64*29] 6390 pmulhrsw m15, m13, [r4-64*30] 6391 pmulhrsw m16, m13, [r4-64*31] 6392 pmulhrsw m17, m13, [r4-64*32] 6393 pmulhrsw m18, m13, [r4-64*33] 6394 pmulhrsw m19, m13, [r4-64*34] 6395 pmulhrsw m20, m13, [r4-64*35] 6396 pmulhrsw m21, m13, [r4-64*36] 6397 call .transpose_2x8x8_lo 6398 mova [r4-64*20], m1 6399 mova [r4-64*19], m3 6400 mova [r4-64*18], m5 6401 mova [r4-64*17], m7 6402 mova [r4-64*16], m15 6403 mova [r4-64*15], m17 6404 mova [r4-64*14], m19 6405 mova [r4-64*13], m21 6406 vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 6407 vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 6408 vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 6409 vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 6410 vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 6411 vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 6412 vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 6413 vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 6414 vshufi32x4 m2, m0, m22, q3131 ; 8 6415 vshufi32x4 m0, m22, q2020 ; 0 6416 vshufi32x4 m3, m1, m26, q3131 ; 12 6417 vshufi32x4 m1, m26, q2020 ; 4 6418 vshufi32x4 m6, m4, m23, q3131 ; 24 6419 vshufi32x4 m4, m23, q2020 ; 16 6420 vshufi32x4 m7, m5, m27, q3131 ; 28 6421 vshufi32x4 m5, m27, q2020 ; 20 6422 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 6423 vshufi32x4 m16, m14, m24, q3131 ; 10 6424 vshufi32x4 m14, m24, q2020 ; 2 6425 vshufi32x4 m17, m15, m28, q3131 ; 14 6426 vshufi32x4 m15, m28, q2020 ; 6 6427 vshufi32x4 m20, m18, m25, q3131 ; 26 6428 vshufi32x4 m18, m25, q2020 ; 18 6429 vshufi32x4 m21, m19, m29, q3131 ; 30 6430 vshufi32x4 m19, m29, q2020 ; 22 6431 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 6432 mova m22, [r4-64*20] 6433 mova m26, [r4-64*16] 6434 mova m23, [r4-64*19] 6435 mova m27, [r4-64*15] 6436 mova m24, [r4-64*18] 6437 mova m28, [r4-64*14] 6438 mova m25, [r4-64*17] 6439 mova m29, [r4-64*13] 6440 mova [r4-64*20], m14 6441 mova [r4-64*19], m15 6442 mova [r4-64*18], m16 6443 mova [r4-64*17], m17 6444 mova [r4-64*16], m18 6445 mova [r4-64*15], m19 6446 mova [r4-64*14], m20 6447 mova [r4-64*13], m21 6448 mova m19, [r4-64*12] 6449 mova m11, [r4-64* 8] 6450 mova m20, [r4-64*11] 6451 mova m12, [r4-64* 7] 6452 mova m21, [r4-64*10] 6453 mova m8, [r4-64* 6] 6454 mova m9, [r4-64* 9] 6455 mova m18, [r4-64* 5] 6456 vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 6457 vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 6458 vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 6459 vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 6460 vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 6461 vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 6462 vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 6463 vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 6464 vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 6465 vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 6466 vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 6467 vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 6468 vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 6469 vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 6470 vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 6471 vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 6472 vshufi32x4 m26, m22, m27, q3131 ; 9 6473 vshufi32x4 m22, m27, q2020 ; 1 6474 vshufi32x4 m27, m23, m28, q3131 ; 11 6475 vshufi32x4 m23, m28, q2020 ; 3 6476 vshufi32x4 m28, m24, m29, q3131 ; 13 6477 vshufi32x4 m24, m29, q2020 ; 5 6478 vshufi32x4 m29, m25, m8, q3131 ; 15 6479 vshufi32x4 m25, m8, q2020 ; 7 6480 vshufi32x4 m18, m14, m19, q3131 ; 25 6481 vshufi32x4 m14, m19, q2020 ; 17 6482 vshufi32x4 m19, m15, m20, q3131 ; 27 6483 vshufi32x4 m15, m20, q2020 ; 19 6484 vshufi32x4 m20, m16, m21, q3131 ; 29 6485 vshufi32x4 m16, m21, q2020 ; 21 6486 vshufi32x4 m21, m17, m9, q3131 ; 31 6487 vshufi32x4 m17, m9, q2020 ; 23 6488 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 6489 jmp .end 6490.fast: ; bottom/right halves are zero 6491 {evex}vpmulhrsw ym8, ym23, [cq+64* 4] 6492 {evex}vpmulhrsw xm1, xm23, [cq+64*12] 6493 mova m28, [o(dup16_perm)] 6494 {evex}vpmulhrsw ym7, ym23, [cq+64* 8] 6495 vpmulhrsw ym22, ym23, [cq+64* 0] 6496 vpermb m8, m28, m8 6497 vpermb ym1, ym28, ym1 6498 vpermb m7, m28, m7 6499 pmovzxwd m9, ym22 6500 pslld m9, 16 6501 call m(idct_16x16_internal_8bpc).main_fast2 6502 {evex}vpmulhrsw ym21, ym23, [cq+64* 2] 6503 {evex}vpmulhrsw xm15, xm23, [cq+64*14] 6504 {evex}vpmulhrsw xm18, xm23, [cq+64*10] 6505 {evex}vpmulhrsw ym14, ym23, [cq+64* 6] 6506 vpermb m21, m28, m21 6507 punpcklwd xm15, xm15 6508 vpermb ym18, ym28, ym18 6509 vpermb m14, m28, m14 6510 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 6511 vpmulhrsw ym22, ym23, [cq+64* 1] 6512 {evex}vpmulhrsw xm29, xm23, [cq+64*15] 6513 {evex}vpmulhrsw xm26, xm23, [cq+64* 9] 6514 {evex}vpmulhrsw ym25, ym23, [cq+64* 7] 6515 {evex}vpmulhrsw ym24, ym23, [cq+64* 5] 6516 {evex}vpmulhrsw xm27, xm23, [cq+64*11] 6517 {evex}vpmulhrsw xm8, xm23, [cq+64*13] 6518 {evex}vpmulhrsw ym23, [cq+64* 3] 6519 vpermb m22, m28, m22 6520 punpcklwd xm29, xm29 6521 vpermb ym26, ym28, ym26 6522 vpermb m25, m28, m25 6523 mova [cq+64* 0], m14 6524 mova [cq+64* 1], m15 6525 mova [cq+64* 2], m16 6526 mova [cq+64* 3], m17 6527 REPX {vpermb x, m28, x}, m24, m27, m23 6528 punpcklwd xm28, xm8, xm8 6529 mova [cq+64* 4], m18 6530 mova [cq+64* 5], m19 6531 mova [cq+64* 6], m20 6532 mova [cq+64* 7], m21 6533 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 6534 mov r4, rsp 6535 vpbroadcastd m13, [o(pw_16384)] 6536 mova [r4+64*16], m4 6537 mova [r4+64*17], m5 6538 mova [r4+64*18], m6 6539 mova [r4+64*19], m7 6540 mova [r4+64*28], m26 6541 mova [r4+64*29], m27 6542 mova [r4+64*30], m28 6543 mova [r4+64*31], m29 6544 call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end 6545 mova [r4+64*20], m22 6546 mova [r4+64*21], m23 6547 mova [r4+64*22], m24 6548 mova [r4+64*23], m25 6549 mova [r4+64*24], m26 6550 mova [r4+64*25], m27 6551 mova [r4+64*26], m28 6552 mova [r4+64*27], m29 6553 call .pass2_fast 6554 mova [cq+64* 8], m14 6555 mova [cq+64* 9], m15 6556 mova [cq+64*10], m16 6557 mova [cq+64*11], m17 6558 mova [cq+64*12], m18 6559 mova [cq+64*13], m19 6560 mova [cq+64*14], m20 6561 mova [cq+64*15], m21 6562 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6563 mova [cq+64* 0], m0 6564 mova [cq+64* 1], m1 6565 mova [cq+64* 2], m2 6566 mova [cq+64* 3], m3 6567 mova [cq+64* 4], m4 6568 mova [cq+64* 5], m5 6569 mova [cq+64* 6], m6 6570 mova [cq+64* 7], m7 6571 pmulhrsw m0, m13, [r4+64*16] 6572 pmulhrsw m1, m13, [r4+64*17] 6573 pmulhrsw m2, m13, [r4+64*18] 6574 pmulhrsw m3, m13, [r4+64*19] 6575 pmulhrsw m4, m13, [r4+64*20] 6576 pmulhrsw m5, m13, [r4+64*21] 6577 pmulhrsw m6, m13, [r4+64*22] 6578 pmulhrsw m7, m13, [r4+64*23] 6579 mova [cq+64*16], m14 6580 mova [cq+64*17], m15 6581 mova [cq+64*18], m16 6582 mova [cq+64*19], m17 6583 mova [cq+64*20], m18 6584 mova [cq+64*21], m19 6585 mova [cq+64*22], m20 6586 mova [cq+64*23], m21 6587 pmulhrsw m14, m13, [r4+64*24] 6588 pmulhrsw m15, m13, [r4+64*25] 6589 pmulhrsw m16, m13, [r4+64*26] 6590 pmulhrsw m17, m13, [r4+64*27] 6591 pmulhrsw m18, m13, [r4+64*28] 6592 pmulhrsw m19, m13, [r4+64*29] 6593 pmulhrsw m20, m13, [r4+64*30] 6594 pmulhrsw m21, m13, [r4+64*31] 6595 mova [cq+64*24], m22 6596 mova [cq+64*25], m23 6597 mova [cq+64*26], m24 6598 mova [cq+64*27], m25 6599 mova [cq+64*28], m26 6600 mova [cq+64*29], m27 6601 mova [cq+64*30], m28 6602 mova [cq+64*31], m29 6603 call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round 6604 call .pass2_fast 6605 mova [r4+64*16], m14 6606 mova [r4+64*17], m15 6607 mova [r4+64*18], m16 6608 mova [r4+64*19], m17 6609 mova [r4+64*20], m18 6610 mova [r4+64*21], m19 6611 mova [r4+64*22], m20 6612 mova [r4+64*23], m21 6613 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6614.end: 6615 vpbroadcastd m13, [o(pw_2048)] 6616 lea r5, [strideq*3] 6617 pxor m12, m12 6618 lea r3, [dstq+r5*8] 6619 lea r6, [strideq+r5] ; stride*4 6620 add r3, r6 ; dst+stride*28 6621%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi 6622 mova m11, [cq+64*( %3)] ; 0 6623 mova m9, [cq+64*(31-%3)] ; 31 6624%if %3 >= 8 6625 mova m%1, [rsp+64*(%1+16)] 6626%endif 6627 mova m10, [dstq+%4] 6628 paddsw m8, m11, m9 6629 psubsw m11, m9 6630 paddsw m9, m%1, m%2 6631 psubsw m%1, m%2 6632 punpcklbw m%2, m10, m12 6633 punpckhbw m10, m12 6634 pmulhrsw m8, m13 6635 pmulhrsw m9, m13 6636 paddw m8, m%2 6637 paddw m9, m10 6638 mova m10, [r3+%5] 6639 pmulhrsw m11, m13 6640 pmulhrsw m%1, m13 6641 mova [cq+64*( %3)], m12 6642 mova [cq+64*(31-%3)], m12 6643 punpcklbw m%2, m10, m12 6644 punpckhbw m10, m12 6645 packuswb m8, m9 6646 paddw m11, m%2 6647 paddw m%1, m10 6648 packuswb m11, m%1 6649 mova [dstq+%4], m8 6650 mova [r3 +%5], m11 6651%if %3 == 3 || %3 == 7 || %3 == 11 6652 add dstq, r6 6653 sub r3, r6 6654%endif 6655%endmacro 6656 IDCT_64x32_END 0, 29, 0, strideq*0, r5 6657 IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 6658 IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 6659 IDCT_64x32_END 3, 26, 3, r5 , strideq*0 6660 IDCT_64x32_END 4, 25, 4, strideq*0, r5 6661 IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 6662 IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 6663 IDCT_64x32_END 7, 22, 7, r5 , strideq*0 6664 IDCT_64x32_END 0, 21, 8, strideq*0, r5 6665 IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 6666 IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 6667 IDCT_64x32_END 3, 18, 11, r5 , strideq*0 6668 IDCT_64x32_END 4, 17, 12, strideq*0, r5 6669 IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 6670 IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 6671 IDCT_64x32_END 7, 14, 15, r5 , strideq*0 6672 RET 6673ALIGN function_align 6674.dconly: 6675 movsx r6d, word [cq] 6676 mov [cq], eobd 6677 imul r6d, 181 6678 mov r3d, 32 6679 add r6d, 128 6680 sar r6d, 8 6681 imul r6d, 181 6682 add r6d, 128+256 6683 sar r6d, 8+1 6684 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 6685ALIGN function_align 6686.pass1_end_part1: 6687%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 6688%if %1 != %3 6689 mova m%1, [cq+64*%1] 6690%endif 6691 mova m9, [r4+64*(%3-36)] ; idct64 32+n 6692 mova m11, [r4+64*(-5-%3)] ; idct64 63-n 6693 psubsw m8, m%1, m%2 ; idct32 31-n 6694 paddsw m%1, m%2 ; idct32 0+n 6695%if %1 == %3 6696 psubsw m%2, m8, m9 ; out 32+n e 6697 paddsw m8, m9 ; out 31-n d 6698 psubsw m9, m%1, m11 ; out 63-n h 6699 paddsw m%1, m11 ; out 0+n a 6700%else 6701 paddsw m%2, m8, m9 ; out 23-n c 6702 psubsw m8, m9 ; out 40+n f 6703 paddsw m9, m%1, m11 ; out 8+n b 6704 psubsw m%1, m11 ; out 55-n g 6705%endif 6706 mova [r4+64*(%3-36)], m8 6707 mova [r4+64*(-5-%3)], m9 6708%endmacro 6709 IDCT_64x32_PASS1_END 0, 29, 0 6710 IDCT_64x32_PASS1_END 1, 28, 1 6711 IDCT_64x32_PASS1_END 2, 27, 2 6712 IDCT_64x32_PASS1_END 3, 26, 3 6713 IDCT_64x32_PASS1_END 4, 25, 4 6714 IDCT_64x32_PASS1_END 5, 24, 5 6715 IDCT_64x32_PASS1_END 6, 23, 6 6716 IDCT_64x32_PASS1_END 7, 22, 7 6717.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) 6718 punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 6719 punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 6720 punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 6721 punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 6722 punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 6723 punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 6724 punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 6725 punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 6726 punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 6727 punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 6728 punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 6729 punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 6730 punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3 6731 punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 6732 punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 6733 punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 6734 punpckhqdq m23, m22, m27 ; 1 23 6735 punpcklqdq m22, m27 ; 0 22 6736 punpckhqdq m27, m26, m28 ; 5 27 6737 punpcklqdq m26, m28 ; 4 26 6738 punpcklqdq m28, m29, m25 ; 6 28 6739 punpckhqdq m29, m25 ; 7 29 6740 punpckhqdq m25, m24, m8 ; 3 25 6741 punpcklqdq m24, m8 ; 2 24 6742.transpose_8x8: 6743 punpckhwd m8, m4, m5 6744 punpcklwd m4, m5 6745 punpckhwd m5, m0, m1 6746 punpcklwd m0, m1 6747 punpckhwd m1, m6, m7 6748 punpcklwd m6, m7 6749 punpckhwd m7, m2, m3 6750 punpcklwd m2, m3 6751 punpckhdq m3, m0, m2 6752 punpckldq m0, m2 6753 punpckldq m2, m4, m6 6754 punpckhdq m4, m6 6755 punpckhdq m6, m5, m7 6756 punpckldq m5, m7 6757 punpckldq m7, m8, m1 6758 punpckhdq m8, m1 6759 punpckhqdq m1, m0, m2 6760 punpcklqdq m0, m2 6761 punpcklqdq m2, m3, m4 6762 punpckhqdq m3, m4 6763 punpcklqdq m4, m5, m7 6764 punpckhqdq m5, m7 6765 punpckhqdq m7, m6, m8 6766 punpcklqdq m6, m8 6767 ret 6768.pass1_end_part2: 6769 IDCT_64x32_PASS1_END 0, 21, 8 6770 IDCT_64x32_PASS1_END 1, 20, 9 6771 IDCT_64x32_PASS1_END 2, 19, 10 6772 IDCT_64x32_PASS1_END 3, 18, 11 6773 IDCT_64x32_PASS1_END 4, 17, 12 6774 IDCT_64x32_PASS1_END 5, 16, 13 6775 IDCT_64x32_PASS1_END 6, 15, 14 6776 IDCT_64x32_PASS1_END 7, 14, 15 6777.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 6778 punpcklwd m8, m3, m2 6779 punpckhwd m3, m2 6780 punpcklwd m2, m1, m0 6781 punpckhwd m1, m0 6782 punpcklwd m0, m7, m6 6783 punpckhwd m7, m6 6784 punpcklwd m6, m5, m4 6785 punpckhwd m5, m4 6786 punpckldq m4, m7, m5 6787 punpckhdq m7, m5 6788 punpckldq m5, m8, m2 6789 punpckhdq m8, m2 6790 punpckhdq m2, m0, m6 6791 punpckldq m0, m6 6792 punpckldq m6, m3, m1 6793 punpckhdq m3, m1 6794 punpckhqdq m1, m0, m5 6795 punpcklqdq m0, m5 6796 punpckhqdq m5, m4, m6 6797 punpcklqdq m4, m6 6798 punpcklqdq m6, m7, m3 6799 punpckhqdq m7, m3 6800 punpckhqdq m3, m2, m8 6801 punpcklqdq m2, m8 6802 punpckhwd m8, m18, m19 6803 punpcklwd m18, m19 6804 punpckhwd m19, m14, m15 6805 punpcklwd m14, m15 6806 punpckhwd m15, m20, m21 6807 punpcklwd m20, m21 6808 punpckhwd m21, m16, m17 6809 punpcklwd m16, m17 6810 punpckhdq m17, m14, m16 6811 punpckldq m14, m16 6812 punpckldq m16, m18, m20 6813 punpckhdq m18, m20 6814 punpckhdq m20, m19, m21 6815 punpckldq m19, m21 6816 punpckldq m21, m8, m15 6817 punpckhdq m8, m15 6818 punpckhqdq m15, m14, m16 6819 punpcklqdq m14, m16 6820 punpcklqdq m16, m17, m18 6821 punpckhqdq m17, m18 6822 punpcklqdq m18, m19, m21 6823 punpckhqdq m19, m21 6824 punpckhqdq m21, m20, m8 6825 punpcklqdq m20, m8 6826 ret 6827.pass2_fast: 6828 vshufi32x4 m24, m9, m15, q3131 ; 5 6829 vshufi32x4 m22, m9, m15, q2020 ; 1 6830 vshufi32x4 m15, m1, m16, q3131 ; 6 6831 vshufi32x4 m14, m1, m16, q2020 ; 2 6832 vshufi32x4 m1, m0, m3, q3131 ; 4 6833 vshufi32x4 m0, m3, q2020 ; 0 6834 vshufi32x4 m3, m8, m2, q3131 ; 12 6835 vshufi32x4 m2, m8, m2, q2020 ; 8 6836 vshufi32x4 m25, m11, m17, q3131 ; 7 6837 vshufi32x4 m23, m11, m17, q2020 ; 3 6838 vshufi32x4 m17, m5, m19, q3131 ; 14 6839 vshufi32x4 m16, m5, m19, q2020 ; 10 6840 vshufi32x4 m29, m6, m20, q3131 ; 15 6841 vshufi32x4 m27, m6, m20, q2020 ; 11 6842 vshufi32x4 m28, m4, m18, q3131 ; 13 6843 vshufi32x4 m26, m4, m18, q2020 ; 9 6844 jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 6845 6846cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob 6847 lea r5, [o_base] 6848 test eobd, eobd 6849 jz .dconly 6850 PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob 6851%undef cmp 6852 cmp eobd, 136 6853 jb .fast 6854 mova m0, [cq+64* 1] 6855 mova m1, [cq+64*31] 6856 mova m2, [cq+64*17] 6857 mova m3, [cq+64*15] 6858 vpbroadcastd m10, [o(pd_2048)] 6859 mov r4, rsp 6860 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6861 mova m0, [cq+64* 7] 6862 mova m1, [cq+64*25] 6863 mova m2, [cq+64*23] 6864 mova m3, [cq+64* 9] 6865 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6866 mova m0, [cq+64* 5] 6867 mova m1, [cq+64*27] 6868 mova m2, [cq+64*21] 6869 mova m3, [cq+64*11] 6870 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6871 mova m0, [cq+64* 3] 6872 mova m1, [cq+64*29] 6873 mova m2, [cq+64*19] 6874 mova m3, [cq+64*13] 6875 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6876 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 6877 mova m0, [cq+64* 0] 6878 mova m1, [cq+64* 8] 6879 mova m2, [cq+64*16] 6880 mova m3, [cq+64*24] 6881 mova m14, [cq+64* 4] 6882 mova m15, [cq+64*12] 6883 mova m16, [cq+64*20] 6884 mova m17, [cq+64*28] 6885 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 6886 mova m22, [cq+64* 2] 6887 mova m29, [cq+64*30] 6888 mova m26, [cq+64*18] 6889 mova m25, [cq+64*14] 6890 mova m24, [cq+64*10] 6891 mova m27, [cq+64*22] 6892 mova m28, [cq+64*26] 6893 mova m23, [cq+64* 6] 6894 mova [cq+64* 0], m14 6895 mova [cq+64* 1], m15 6896 mova [cq+64* 2], m16 6897 mova [cq+64* 3], m17 6898 mova [cq+64* 4], m18 6899 mova [cq+64* 5], m19 6900 mova [cq+64* 6], m20 6901 mova [cq+64* 7], m21 6902 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6903 vpbroadcastd m13, [o(pw_8192)] 6904 call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 6905 mova [r4+64*36], m1 6906 mova [r4+64*37], m3 6907 mova [r4+64*38], m5 6908 mova [r4+64*39], m7 6909 mova [r4+64*44], m23 6910 mova [r4+64*45], m25 6911 mova [r4+64*46], m27 6912 mova [r4+64*47], m29 6913 pmulhrsw m23, m13, m0 ; a0 6914 pmulhrsw m25, m13, m2 ; a2 6915 pmulhrsw m27, m13, m4 ; a4 6916 pmulhrsw m29, m13, m6 ; a6 6917 call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 6918 lea r6, [r4-64*4] 6919 add r4, 64*28 6920 call .pass2_end 6921 mov r4, rsp 6922 mova m0, [r4+64*23] 6923 mova m1, [r4+64*22] 6924 mova m2, [r4+64*21] 6925 mova m3, [r4+64*20] 6926 mova m4, [r4+64*19] 6927 mova m5, [r4+64*18] 6928 mova m6, [r4+64*17] 6929 mova m7, [r4+64*16] 6930 mova m22, [r4+64*15] 6931 mova m23, [r4+64*14] 6932 mova m24, [r4+64*13] 6933 mova m25, [r4+64*12] 6934 mova m26, [r4+64*11] 6935 mova m27, [r4+64*10] 6936 mova m28, [r4+64* 9] 6937 mova m29, [r4+64* 8] 6938 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi 6939 vpbroadcastd m13, [o(pw_8192)] 6940 mova [r4+64* 8], m1 6941 mova [r4+64* 9], m3 6942 mova [r4+64*10], m5 6943 mova [r4+64*11], m7 6944 mova [r4+64*16], m23 6945 mova [r4+64*17], m25 6946 mova [r4+64*18], m27 6947 mova [r4+64*19], m29 6948 pmulhrsw m23, m13, m0 ; b0 6949 pmulhrsw m25, m13, m2 ; b2 6950 pmulhrsw m27, m13, m4 ; b4 6951 pmulhrsw m29, m13, m6 ; b6 6952 mova m0, [r4+64*31] 6953 mova m1, [r4+64*30] 6954 mova m2, [r4+64*29] 6955 mova m3, [r4+64*28] 6956 mova m4, [r4+64*27] 6957 mova m5, [r4+64*26] 6958 mova m6, [r4+64*25] 6959 mova m7, [r4+64*24] 6960 mova m14, [r4+64* 7] 6961 mova m15, [r4+64* 6] 6962 mova m16, [r4+64* 5] 6963 mova m17, [r4+64* 4] 6964 mova m18, [r4+64* 3] 6965 mova m19, [r4+64* 2] 6966 mova m20, [r4+64* 1] 6967 mova m21, [r4+64* 0] 6968 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo 6969 mov r6, cq 6970 call .pass2_end 6971 jmp .end 6972.fast: ; bottom/right halves are zero 6973 mova m28, [o(dup16_perm)] 6974 pmovzxwd m9, [cq+64* 0] 6975 vpermb m8, m28, [cq+64* 4] 6976 vpermb ym1, ym28, [cq+64*12] 6977 vpermb m7, m28, [cq+64* 8] 6978 pslld m9, 16 6979 call m(idct_16x16_internal_8bpc).main_fast2 6980 vpermb m21, m28, [cq+64* 2] 6981 vpermb ym15, ym28, [cq+64*14] 6982 vpermb ym18, ym28, [cq+64*10] 6983 vpermb m14, m28, [cq+64* 6] 6984 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 6985 vpermb m22, m28, [cq+64* 1] 6986 vpermb ym29, ym28, [cq+64*15] 6987 vpermb ym26, ym28, [cq+64* 9] 6988 vpermb m25, m28, [cq+64* 7] 6989 vpermb m24, m28, [cq+64* 5] 6990 vpermb ym27, ym28, [cq+64*11] 6991 vpermb m23, m28, [cq+64* 3] 6992 vpermb ym28, ym28, [cq+64*13] 6993 mova [cq+64* 0], m14 6994 mova [cq+64* 1], m15 6995 mova [cq+64* 2], m16 6996 mova [cq+64* 3], m17 6997 mova [cq+64* 4], m18 6998 mova [cq+64* 5], m19 6999 mova [cq+64* 6], m20 7000 mova [cq+64* 7], m21 7001 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 7002 vpbroadcastd m13, [o(pw_8192)] 7003 mova [cq+64*16], m4 7004 mova [cq+64*17], m5 7005 mova [cq+64*18], m6 7006 mova [cq+64*19], m7 7007 mova [cq+64*28], m26 7008 mova [cq+64*29], m27 7009 mova [cq+64*30], m28 7010 mova [cq+64*31], m29 7011 call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end 7012 mova [cq+64*20], m22 7013 mova [cq+64*21], m23 7014 mova [cq+64*22], m24 7015 mova [cq+64*23], m25 7016 mova [cq+64*24], m26 7017 mova [cq+64*25], m27 7018 mova [cq+64*26], m28 7019 mova [cq+64*27], m29 7020 lea r4, [rsp+64*64] 7021 lea r3, [rsp+64*32] 7022 call .pass2_fast 7023 pmulhrsw m0, m13, [cq+64*16] 7024 pmulhrsw m1, m13, [cq+64*17] 7025 pmulhrsw m2, m13, [cq+64*18] 7026 pmulhrsw m3, m13, [cq+64*19] 7027 pmulhrsw m4, m13, [cq+64*20] 7028 pmulhrsw m5, m13, [cq+64*21] 7029 pmulhrsw m6, m13, [cq+64*22] 7030 pmulhrsw m7, m13, [cq+64*23] 7031 pmulhrsw m14, m13, [cq+64*24] 7032 pmulhrsw m15, m13, [cq+64*25] 7033 pmulhrsw m16, m13, [cq+64*26] 7034 pmulhrsw m17, m13, [cq+64*27] 7035 pmulhrsw m18, m13, [cq+64*28] 7036 pmulhrsw m19, m13, [cq+64*29] 7037 pmulhrsw m20, m13, [cq+64*30] 7038 pmulhrsw m21, m13, [cq+64*31] 7039 call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round 7040 mov r4, rsp 7041 mov r3, cq 7042 call .pass2_fast 7043.end: 7044 vpbroadcastd m17, [o(pw_2048)] 7045 lea r5, [strideq*8] 7046 mov r3, dstq 7047 pxor m16, m16 7048 sub r4, 64*5 ; rsp+64*31 7049 mov r6, rsp 7050.end_loop: 7051 mova m2, [r6+64*32] ; idct16 0+n lo 7052 mova m7, [r6+64*48] ; idct32 31-n lo 7053 mova m6, [cq+64* 0] ; idct16 0+n hi 7054 mova m0, [cq+64*16] ; idct32 31-n hi 7055 mova m4, [r4+64*64] ; idct64 63-n lo 7056 mova m1, [r4+64* 0] ; idct64 63-n hi 7057 mova m5, [r6+64*64] ; idct64 32+n lo 7058 mova m8, [r6+64* 0] ; idct64 32+n hi 7059 sub r3, strideq 7060 paddsw m3, m2, m7 ; idct32 0+n lo 7061 mova m12, [dstq+r5*0] 7062 psubsw m2, m7 ; idct32 31-n lo 7063 mova m15, [r3 +r5*8] 7064 paddsw m7, m6, m0 ; idct32 0+n hi 7065 mova m13, [r3 +r5*4] 7066 psubsw m6, m0 ; idct32 31-n hi 7067 mova m14, [dstq+r5*4] 7068 paddsw m0, m3, m4 ; out 0+n lo 7069 add r6, 64 7070 psubsw m3, m4 ; out 63-n lo 7071 sub r4, 64 7072 paddsw m4, m7, m1 ; out 0+n hi 7073 mova [cq+64* 0], m16 7074 psubsw m7, m1 ; out 63-n hi 7075 mova [cq+64*16], m16 7076 paddsw m1, m2, m5 ; out 31-n lo 7077 add cq, 64 7078 psubsw m2, m5 ; out 32+n lo 7079 paddsw m5, m6, m8 ; out 31-n hi 7080 psubsw m6, m8 ; out 32+n hi 7081 pmulhrsw m0, m17 7082 punpcklbw m8, m12, m16 7083 pmulhrsw m4, m17 7084 punpckhbw m12, m16 7085 pmulhrsw m3, m17 7086 punpcklbw m11, m15, m16 7087 pmulhrsw m7, m17 7088 punpckhbw m15, m16 7089 pmulhrsw m1, m17 7090 punpcklbw m9, m13, m16 7091 pmulhrsw m5, m17 7092 punpckhbw m13, m16 7093 pmulhrsw m2, m17 7094 punpcklbw m10, m14, m16 7095 pmulhrsw m6, m17 7096 punpckhbw m14, m16 7097 paddw m0, m8 7098 paddw m4, m12 7099 packuswb m0, m4 7100 paddw m3, m11 7101 paddw m7, m15 7102 packuswb m3, m7 7103 paddw m1, m9 7104 paddw m5, m13 7105 packuswb m1, m5 7106 paddw m2, m10 7107 paddw m6, m14 7108 packuswb m2, m6 7109 mova [dstq+r5*0], m0 7110 mova [r3 +r5*8], m3 7111 mova [r3 +r5*4], m1 7112 mova [dstq+r5*4], m2 7113 add dstq, strideq 7114 cmp r6, r4 7115 jb .end_loop 7116 RET 7117.dconly: 7118 movsx r6d, word [cq] 7119 mov [cq], eobd 7120 mov r3d, 64 7121 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly 7122ALIGN function_align 7123.pass2_end: 7124 REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 7125 mova [r4+64*20], m1 7126 mova [r4+64*21], m3 7127 mova [r4+64*22], m5 7128 mova [r4+64*23], m7 7129 vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 7130 vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 7131 vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 7132 vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 7133 mova [r4+64*12], m15 7134 mova [r4+64*13], m17 7135 mova [r4+64*14], m19 7136 mova [r4+64*15], m21 7137 vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 7138 vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 7139 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 7140 vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 7141 vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 7142 vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 7143 vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 7144 vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 7145 vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 7146 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 7147 vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 7148 vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 7149 vshufi32x4 m0, m1, m5, q2020 ; 0 7150 vshufi32x4 m1, m5, q3131 ; 8 7151 vshufi32x4 m2, m3, m14, q2020 ; 16 7152 vshufi32x4 m3, m14, q3131 ; 24 7153 vshufi32x4 m14, m15, m18, q2020 ; 4 7154 vshufi32x4 m15, m18, q3131 ; 12 7155 vshufi32x4 m16, m17, m19, q2020 ; 20 7156 vshufi32x4 m17, m19, q3131 ; 28 7157 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 7158 vshufi32x4 m24, m22, m25, q3131 ; 10 7159 vshufi32x4 m22, m25, q2020 ; 2 7160 vshufi32x4 m25, m23, m28, q3131 ; 14 7161 vshufi32x4 m23, m28, q2020 ; 6 7162 vshufi32x4 m28, m26, m27, q3131 ; 26 7163 vshufi32x4 m26, m27, q2020 ; 18 7164 vshufi32x4 m27, m29, m13, q2020 ; 22 7165 vshufi32x4 m29, m13, q3131 ; 30 7166 mova [r6+64* 0], m0 7167 mova [r6+64* 1], m1 7168 mova [r6+64* 2], m2 7169 mova [r6+64* 3], m3 7170 mova [r6+64* 4], m4 7171 mova [r6+64* 5], m5 7172 mova [r6+64* 6], m6 7173 mova [r6+64* 7], m7 7174 mova [r6+64* 8], m14 7175 mova [r6+64* 9], m15 7176 mova [r6+64*10], m16 7177 mova [r6+64*11], m17 7178 mova [r6+64*12], m18 7179 mova [r6+64*13], m19 7180 mova [r6+64*14], m20 7181 mova [r6+64*15], m21 7182 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 7183 vpbroadcastd m13, [o(pw_8192)] 7184 mova [r6+64*16], m29 7185 mova [r6+64*17], m28 7186 mova [r6+64*18], m27 7187 mova [r6+64*19], m26 7188 mova [r6+64*20], m25 7189 mova [r6+64*21], m24 7190 mova [r6+64*22], m23 7191 mova [r6+64*23], m22 7192 mova [r6+64*24], m21 7193 mova [r6+64*25], m20 7194 mova [r6+64*26], m19 7195 mova [r6+64*27], m18 7196 mova [r6+64*28], m17 7197 mova [r6+64*29], m16 7198 mova [r6+64*30], m15 7199 mova [r6+64*31], m14 7200 pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 7201 pmulhrsw m16, m13, [r4+64*12] 7202 pmulhrsw m17, m13, [r4+64*16] 7203 pmulhrsw m18, m13, [r4+64*20] 7204 pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 7205 pmulhrsw m20, m13, [r4+64*15] 7206 pmulhrsw m21, m13, [r4+64*19] 7207 pmulhrsw m22, m13, [r4+64*23] 7208 vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 7209 vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 7210 vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 7211 vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 7212 pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 7213 pmulhrsw m24, m13, [r4+64*14] 7214 pmulhrsw m25, m13, [r4+64*18] 7215 pmulhrsw m26, m13, [r4+64*22] 7216 vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 7217 vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 7218 vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 7219 vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 7220 pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 7221 pmulhrsw m28, m13, [r4+64*13] 7222 pmulhrsw m29, m13, [r4+64*17] 7223 pmulhrsw m13, [r4+64*21] 7224 vshufi32x4 m0, m14, m16, q2020 ; 1 7225 vshufi32x4 m1, m19, m21, q3131 ; 31 7226 vshufi32x4 m2, m15, m17, q2020 ; 17 7227 vshufi32x4 m3, m18, m20, q3131 ; 15 7228 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7229 vshufi32x4 m0, m18, m20, q2020 ; 7 7230 vshufi32x4 m1, m15, m17, q3131 ; 25 7231 vshufi32x4 m2, m19, m21, q2020 ; 23 7232 vshufi32x4 m3, m14, m16, q3131 ; 9 7233 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7234 vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 7235 vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 7236 vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13 7237 vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 7238 vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 7239 vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 7240 vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 7241 vshufi32x4 m29, m13, q3232 ; e19 e17 g19 g27 7242 vshufi32x4 m0, m22, m24, q2020 ; 5 7243 vshufi32x4 m1, m27, m29, q3131 ; 27 7244 vshufi32x4 m2, m23, m25, q2020 ; 21 7245 vshufi32x4 m3, m26, m28, q3131 ; 11 7246 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7247 vshufi32x4 m0, m26, m28, q2020 ; 3 7248 vshufi32x4 m1, m23, m25, q3131 ; 29 7249 vshufi32x4 m2, m27, m29, q2020 ; 19 7250 vshufi32x4 m3, m22, m24, q3131 ; 13 7251 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7252 jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 7253ALIGN function_align 7254.pass2_fast: 7255 vshufi32x4 m23, m1, m16, q3131 ; 6 7256 vshufi32x4 m22, m1, m16, q2020 ; 2 7257 vshufi32x4 m14, m0, m3, q3131 ; 4 7258 vshufi32x4 m26, m0, m3, q2020 ; 0 7259 vshufi32x4 m28, m9, m15, q3131 ; 5 7260 vshufi32x4 m0, m9, m15, q2020 ; 1 7261 vshufi32x4 m16, m11, m17, q3131 ; 7 7262 vshufi32x4 m29, m11, m17, q2020 ; 3 7263 vshufi32x4 m15, m8, m2, q3131 ; 12 7264 vshufi32x4 m27, m8, m2, q2020 ; 8 7265 vshufi32x4 m25, m5, m19, q3131 ; 14 7266 vshufi32x4 m24, m5, m19, q2020 ; 10 7267 vshufi32x4 m3, m6, m20, q3131 ; 15 7268 vshufi32x4 m19, m6, m20, q2020 ; 11 7269 vshufi32x4 m17, m4, m18, q3131 ; 13 7270 vshufi32x4 m18, m4, m18, q2020 ; 9 7271 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 7272 mova m0, m16 7273 mova m3, m18 7274 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 7275 mova m0, m28 7276 mova m3, m19 7277 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 7278 mova m0, m29 7279 mova m3, m17 7280 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 7281 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 7282 mova m0, m26 7283 mova m1, m27 7284 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 7285 mova [r3+64* 0], m0 7286 mova [r3+64* 1], m1 7287 mova [r3+64* 2], m2 7288 mova [r3+64* 3], m3 7289 mova [r3+64* 4], m4 7290 mova [r3+64* 5], m5 7291 mova [r3+64* 6], m6 7292 mova [r3+64* 7], m7 7293 mova [r3+64* 8], m14 7294 mova [r3+64* 9], m15 7295 mova [r3+64*10], m16 7296 mova [r3+64*11], m17 7297 mova [r3+64*12], m18 7298 mova [r3+64*13], m19 7299 mova [r3+64*14], m20 7300 mova [r3+64*15], m21 7301 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 7302 mova [r3+64*16], m29 7303 mova [r3+64*17], m28 7304 mova [r3+64*18], m27 7305 mova [r3+64*19], m26 7306 mova [r3+64*20], m25 7307 mova [r3+64*21], m24 7308 mova [r3+64*22], m23 7309 mova [r3+64*23], m22 7310 mova [r3+64*24], m21 7311 mova [r3+64*25], m20 7312 mova [r3+64*26], m19 7313 mova [r3+64*27], m18 7314 mova [r3+64*28], m17 7315 mova [r3+64*29], m16 7316 mova [r3+64*30], m15 7317 mova [r3+64*31], m14 7318 ret 7319 7320%endif ; ARCH_X86_64 7321