;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"

SECTION_RODATA 32

%macro VP9_IDCT_COEFFS 2-3 0
const pw_m%1_%2
times 8 dw -%1, %2
const pw_%2_%1
times 8 dw %2, %1

%if %3 == 1
const pw_m%2_m%1
times 8 dw -%2, -%1
%if %1 != %2
const pw_m%2_%1
times 8 dw -%2, %1
const pw_%1_%2
times 8 dw %1, %2
%endif
%endif

%if %1 < 11585
pw_m%1x2: times 16 dw -%1*2
%elif %1 > 11585
pw_%1x2: times 16 dw %1*2
%else
const pw_%1x2
times 16 dw %1*2
%endif

%if %2 != %1
pw_%2x2: times 16 dw %2*2
%endif
%endmacro

VP9_IDCT_COEFFS 16364,   804
VP9_IDCT_COEFFS 16305,  1606
VP9_IDCT_COEFFS 16069,  3196, 1
VP9_IDCT_COEFFS 15893,  3981
VP9_IDCT_COEFFS 15137,  6270, 1
VP9_IDCT_COEFFS 14811,  7005
VP9_IDCT_COEFFS 14449,  7723
VP9_IDCT_COEFFS 13160,  9760
VP9_IDCT_COEFFS 11585, 11585, 1
VP9_IDCT_COEFFS 11003, 12140
VP9_IDCT_COEFFS 10394, 12665
VP9_IDCT_COEFFS  9102, 13623, 1
VP9_IDCT_COEFFS  8423, 14053
VP9_IDCT_COEFFS  5520, 15426
VP9_IDCT_COEFFS  4756, 15679
VP9_IDCT_COEFFS  2404, 16207

const pw_5283_13377
times 4 dw 5283, 13377
const pw_9929_13377
times 4 dw 9929, 13377
const pw_15212_m13377
times 4 dw 15212, -13377
const pw_15212_9929
times 4 dw 15212, 9929
const pw_m5283_m15212
times 4 dw -5283, -15212
const pw_13377x2
times 8 dw 13377*2
const pw_m13377_13377
times 4 dw -13377, 13377
const pw_13377_0
times 4 dw 13377, 0

cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_m1
cextern pd_8192

SECTION .text

%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
    punpckhwd          m%4, m%2, m%1
    punpcklwd          m%2, m%1
    pmaddwd            m%3, m%4, [pw_m%5_%6]
    pmaddwd            m%4, [pw_%6_%5]
    pmaddwd            m%1, m%2, [pw_m%5_%6]
    pmaddwd            m%2, [pw_%6_%5]
%endmacro

%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
    SUMSUB_BA            d, %1, %2, %5
    SUMSUB_BA            d, %3, %4, %5
    paddd              m%1, %6
    paddd              m%2, %6
    paddd              m%3, %6
    paddd              m%4, %6
    psrad              m%1, 14
    psrad              m%2, 14
    psrad              m%3, 14
    psrad              m%4, 14
    packssdw           m%1, m%3
    packssdw           m%2, m%4
%endmacro

%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
%if mmsize == 32
    pmovzxbw           m%3, [%6]
    pmovzxbw           m%4, [%6+strideq]
%else
    movh               m%3, [%6]
    movh               m%4, [%6+strideq]
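    ; The two rows were loaded as packed bytes; interleaving with the zero
    ; register below widens them to words so the signed residual can be added
    ; without wrapping before packuswb saturates back to bytes.  A rough C
    ; model of one VP9_STORE_2X row (an illustrative sketch, not part of the
    ; build):
    ;
    ;     for (x = 0; x < 8; x++)
    ;         dst[x] = av_clip_uint8(dst[x] + residual[x]);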
punpcklbw m%3, m%5 137 punpcklbw m%4, m%5 138%endif 139 paddw m%3, m%1 140 paddw m%4, m%2 141%if mmsize == 32 142 packuswb m%3, m%4 143 ; Intel... 144 vpermq m%3, m%3, q3120 145 mova [%6], xm%3 146 vextracti128 [%6+strideq], m%3, 1 147%elif mmsize == 16 148 packuswb m%3, m%4 149 movh [%6], m%3 150 movhps [%6+strideq], m%3 151%else 152 packuswb m%3, m%5 153 packuswb m%4, m%5 154 movh [%6], m%3 155 movh [%6+strideq], m%4 156%endif 157%endmacro 158 159%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg 160%assign %%y 0 161%rep %3 162%assign %%x 0 163%rep %3*2/mmsize 164 mova [%1+%%y+%%x], %4 165%assign %%x (%%x+mmsize) 166%endrep 167%assign %%y (%%y+%2) 168%endrep 169%endmacro 170 171;------------------------------------------------------------------------------------------- 172; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 173;------------------------------------------------------------------------------------------- 174 175INIT_MMX mmx 176cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob 177 mova m0, [blockq+0*8] 178 mova m1, [blockq+1*8] 179 mova m2, [blockq+2*8] 180 mova m3, [blockq+3*8] 181 psraw m0, 2 182 psraw m1, 2 183 psraw m2, 2 184 psraw m3, 2 185 186 VP9_IWHT4_1D 187 TRANSPOSE4x4W 0, 1, 2, 3, 4 188 VP9_IWHT4_1D 189 190 pxor m4, m4 191 VP9_STORE_2X 0, 1, 5, 6, 4 192 lea dstq, [dstq+strideq*2] 193 VP9_STORE_2X 2, 3, 5, 6, 4 194 ZERO_BLOCK blockq, 8, 4, m4 195 RET 196 197;------------------------------------------------------------------------------------------- 198; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 199;------------------------------------------------------------------------------------------- 200 201; 2x2 top left corner 202%macro VP9_IDCT4_2x2_1D 0 203 pmulhrsw m0, m5 ; m0=t1 204 mova m2, m0 ; m2=t0 205 mova m3, m1 206 pmulhrsw m1, m6 ; m1=t2 207 pmulhrsw m3, m7 ; m3=t3 208 VP9_IDCT4_1D_FINALIZE 209%endmacro 210 211%macro VP9_IDCT4_WRITEOUT 0 212%if cpuflag(ssse3) 213 mova m5, [pw_2048] 214 pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 215 pmulhrsw m1, m5 216%else 217 mova m5, [pw_8] 218 paddw m0, m5 219 paddw m1, m5 220 psraw m0, 4 221 psraw m1, 4 222%endif 223 VP9_STORE_2X 0, 1, 6, 7, 4 224 lea dstq, [dstq+2*strideq] 225%if cpuflag(ssse3) 226 pmulhrsw m2, m5 227 pmulhrsw m3, m5 228%else 229 paddw m2, m5 230 paddw m3, m5 231 psraw m2, 4 232 psraw m3, 4 233%endif 234 VP9_STORE_2X 2, 3, 6, 7, 4 235%endmacro 236 237%macro IDCT_4x4_FN 1 238INIT_MMX %1 239cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob 240 241%if cpuflag(ssse3) 242 cmp eobd, 4 ; 2x2 or smaller 243 jg .idctfull 244 245 cmp eobd, 1 ; faster path for when only DC is set 246 jne .idct2x2 247%else 248 cmp eobd, 1 249 jg .idctfull 250%endif 251 252%if cpuflag(ssse3) 253 movd m0, [blockq] 254 mova m5, [pw_11585x2] 255 pmulhrsw m0, m5 256 pmulhrsw m0, m5 257%else 258 DEFINE_ARGS dst, stride, block, coef 259 movsx coefd, word [blockq] 260 imul coefd, 11585 261 add coefd, 8192 262 sar coefd, 14 263 imul coefd, 11585 264 add coefd, (8 << 14) + 8192 265 sar coefd, 14 + 4 266 movd m0, coefd 267%endif 268 pshufw m0, m0, 0 269 pxor m4, m4 270 movh [blockq], m4 271%if cpuflag(ssse3) 272 pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 273%endif 274 VP9_STORE_2X 0, 0, 6, 7, 4 275 lea dstq, [dstq+2*strideq] 276 VP9_STORE_2X 0, 0, 6, 7, 4 277 RET 278 279%if cpuflag(ssse3) 280; faster path for when only top left 2x2 block is set 281.idct2x2: 282 movd m0, [blockq+0] 283 movd m1, 
[blockq+8] 284 mova m5, [pw_11585x2] 285 mova m6, [pw_6270x2] 286 mova m7, [pw_15137x2] 287 VP9_IDCT4_2x2_1D 288 ; partial 2x4 transpose 289 punpcklwd m0, m1 290 punpcklwd m2, m3 291 SBUTTERFLY dq, 0, 2, 1 292 SWAP 1, 2 293 VP9_IDCT4_2x2_1D 294 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X 295 movh [blockq+ 0], m4 296 movh [blockq+ 8], m4 297 VP9_IDCT4_WRITEOUT 298 RET 299%endif 300 301.idctfull: ; generic full 4x4 idct/idct 302 mova m0, [blockq+ 0] 303 mova m1, [blockq+ 8] 304 mova m2, [blockq+16] 305 mova m3, [blockq+24] 306%if cpuflag(ssse3) 307 mova m6, [pw_11585x2] 308%endif 309 mova m7, [pd_8192] ; rounding 310 VP9_IDCT4_1D 311 TRANSPOSE4x4W 0, 1, 2, 3, 4 312 VP9_IDCT4_1D 313 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X 314 mova [blockq+ 0], m4 315 mova [blockq+ 8], m4 316 mova [blockq+16], m4 317 mova [blockq+24], m4 318 VP9_IDCT4_WRITEOUT 319 RET 320%endmacro 321 322IDCT_4x4_FN mmxext 323IDCT_4x4_FN ssse3 324 325;------------------------------------------------------------------------------------------- 326; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 327;------------------------------------------------------------------------------------------- 328 329%macro IADST4_FN 5 330INIT_MMX %5 331cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob 332%if WIN64 && notcpuflag(ssse3) 333 WIN64_SPILL_XMM 8 334%endif 335 movdqa xmm5, [pd_8192] 336 mova m0, [blockq+ 0] 337 mova m1, [blockq+ 8] 338 mova m2, [blockq+16] 339 mova m3, [blockq+24] 340%if cpuflag(ssse3) 341 mova m6, [pw_11585x2] 342%endif 343%ifnidn %1%3, iadstiadst 344 movdq2q m7, xmm5 345%endif 346 VP9_%2_1D 347 TRANSPOSE4x4W 0, 1, 2, 3, 4 348 VP9_%4_1D 349 pxor m4, m4 ; used for the block reset, and VP9_STORE_2X 350 mova [blockq+ 0], m4 351 mova [blockq+ 8], m4 352 mova [blockq+16], m4 353 mova [blockq+24], m4 354 VP9_IDCT4_WRITEOUT 355 RET 356%endmacro 357 358IADST4_FN idct, IDCT4, iadst, IADST4, sse2 359IADST4_FN iadst, IADST4, idct, IDCT4, sse2 360IADST4_FN iadst, IADST4, iadst, IADST4, sse2 361 362IADST4_FN idct, IDCT4, iadst, IADST4, ssse3 363IADST4_FN iadst, IADST4, idct, IDCT4, ssse3 364IADST4_FN iadst, IADST4, iadst, IADST4, ssse3 365 366%macro SCRATCH 3 367%if ARCH_X86_64 368 SWAP %1, %2 369%else 370 mova [%3], m%1 371%endif 372%endmacro 373 374%macro UNSCRATCH 3 375%if ARCH_X86_64 376 SWAP %1, %2 377%else 378 mova m%1, [%3] 379%endif 380%endmacro 381 382;------------------------------------------------------------------------------------------- 383; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 384;------------------------------------------------------------------------------------------- 385 386%macro VP9_IDCT8_1D_FINALIZE 0 387 SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7 388 SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6 389 SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5 390 391 UNSCRATCH 5, 8, blockq+ 0 392 SCRATCH 2, 8, blockq+ 0 393 394 SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4 395 SWAP 7, 6, 2 396 SWAP 3, 5, 0 397 398%if ARCH_X86_64 399 SWAP 6, 8 400%endif 401%endmacro 402 403; x86-32 404; - in: m0/m4 is in mem 405; - out: m6 is in mem 406; x86-64: 407; - everything is in registers (m0-7) 408%macro VP9_IDCT8_1D 0 409%if ARCH_X86_64 410 SWAP 0, 8 411 SWAP 4, 9 412%endif 413 414 VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a 415 VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a 416 SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a) 417 SUMSUB_BA 
w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) 418%if cpuflag(ssse3) 419 SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) 420 pmulhrsw m1, W_11585x2_REG ; m1=t6 421 pmulhrsw m7, W_11585x2_REG ; m7=t5 422%else 423 VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4 424%endif 425 VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a 426 427 UNSCRATCH 0, 8, blockq+ 0 ; IN(0) 428 UNSCRATCH 4, 9, blockq+64 ; IN(4) 429 SCRATCH 5, 8, blockq+ 0 430 431%if cpuflag(ssse3) 432 SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4) 433 pmulhrsw m4, W_11585x2_REG ; m4=t0a 434 pmulhrsw m0, W_11585x2_REG ; m0=t1a 435%else 436 SCRATCH 7, 9, blockq+64 437 VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7 438 UNSCRATCH 7, 9, blockq+64 439%endif 440 SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3) 441 SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) 442 443 VP9_IDCT8_1D_FINALIZE 444%endmacro 445 446%macro VP9_IDCT8_4x4_1D 0 447 pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a 448 pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a 449 pmulhrsw m2, [pw_6270x2] ; m2=t2a 450 pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a 451 pmulhrsw m1, [pw_3196x2] ; m1=t4a 452 pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a 453 pmulhrsw m3, [pw_13623x2] ; m3=t6a 454 SUMSUB_BA w, 5, 1, 4 ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a) 455 SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) 456 SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) 457 pmulhrsw m1, W_11585x2_REG ; m1=t6 458 pmulhrsw m7, W_11585x2_REG ; m7=t5 459 psubw m4, m0, m6 ; m4=t0a-t3a (t3) 460 paddw m6, m0 ; m6=t0a+t3a (t0) 461 SCRATCH 5, 8, blockq+ 0 462 SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) 463 VP9_IDCT8_1D_FINALIZE 464%endmacro 465 466%macro VP9_IDCT8_2x2_1D 1 467 pmulhrsw m0, W_11585x2_REG ; m0=t0 468 pmulhrsw m3, m1, W_16069x2_REG ; m3=t7 469 pmulhrsw m1, W_3196x2_REG ; m1=t4 470 psubw m7, m3, m1 ; t5 = t7a - t4a 471 paddw m5, m3, m1 ; t6 = t7a + t4a 472 pmulhrsw m7, W_11585x2_REG ; m7=t5 473 pmulhrsw m5, W_11585x2_REG ; m5=t6 474 SWAP 5, 1 475 ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier 476 psubw m6, m0, m3 ; m6=t0-t7 477 paddw m3, m0 ; m3=t0+t7 478 psubw m2, m0, m1 ; m2=t1-t6 479 paddw m1, m0 ; m1=t1+t6 480%if %1 == 1 481 punpcklwd m3, m1 482%define SCRATCH_REG 1 483%elif ARCH_X86_32 484 mova [blockq+ 0], m2 485%define SCRATCH_REG 2 486%else 487%define SCRATCH_REG 8 488%endif 489 psubw m4, m0, m5 ; m4=t3-t4 490 paddw m5, m0 ; m5=t3+t4 491 SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5 492 SWAP 7, 6, 2 493 SWAP 3, 5, 0 494%undef SCRATCH_REG 495%endmacro 496 497%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift 498%if cpuflag(ssse3) 499 pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 500 pmulhrsw m%2, %6 501%else 502 paddw m%1, %6 503 paddw m%2, %6 504 psraw m%1, %7 505 psraw m%2, %7 506%endif 507%if %0 <= 7 508 VP9_STORE_2X %1, %2, %3, %4, %5 509%else 510 VP9_STORE_2X %1, %2, %3, %4, %5, %8 511%endif 512%endmacro 513 514; x86-32: 515; - m6 is in mem 516; x86-64: 517; - m8 holds m6 (SWAP) 518; m6 holds zero 519%macro VP9_IDCT8_WRITEOUT 0 520%if ARCH_X86_64 521%if cpuflag(ssse3) 522 mova m9, [pw_1024] 523%else 524 mova m9, [pw_16] 525%endif 526%define ROUND_REG m9 527%else 528%if cpuflag(ssse3) 529%define ROUND_REG [pw_1024] 530%else 531%define ROUND_REG [pw_16] 532%endif 533%endif 534 SCRATCH 5, 10, blockq+16 535 SCRATCH 7, 11, blockq+32 536 VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG 537 lea dstq, [dstq+2*strideq] 538 VP9_IDCT8_WRITEx2 2, 3, 
5, 7, 6, ROUND_REG 539 lea dstq, [dstq+2*strideq] 540 UNSCRATCH 5, 10, blockq+16 541 UNSCRATCH 7, 11, blockq+32 542 VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG 543 lea dstq, [dstq+2*strideq] 544 UNSCRATCH 5, 8, blockq+ 0 545 VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG 546 547%undef ROUND_REG 548%endmacro 549 550%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2 551INIT_XMM %1 552cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob 553 554%if cpuflag(ssse3) 555%if ARCH_X86_64 556 mova m12, [pw_11585x2] ; often used 557%define W_11585x2_REG m12 558%else 559%define W_11585x2_REG [pw_11585x2] 560%endif 561 562 cmp eobd, 12 ; top left half or less 563 jg .idctfull 564 565 cmp eobd, 3 ; top left corner or less 566 jg .idcthalf 567 568 cmp eobd, 1 ; faster path for when only DC is set 569 jne .idcttopleftcorner 570%else 571 cmp eobd, 1 572 jg .idctfull 573%endif 574 575%if cpuflag(ssse3) 576 movd m0, [blockq] 577 pmulhrsw m0, W_11585x2_REG 578 pmulhrsw m0, W_11585x2_REG 579%else 580 DEFINE_ARGS dst, stride, block, coef 581 movsx coefd, word [blockq] 582 imul coefd, 11585 583 add coefd, 8192 584 sar coefd, 14 585 imul coefd, 11585 586 add coefd, (16 << 14) + 8192 587 sar coefd, 14 + 5 588 movd m0, coefd 589%endif 590 SPLATW m0, m0, 0 591 pxor m4, m4 592 movd [blockq], m4 593%if cpuflag(ssse3) 594 pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 595%endif 596%rep 3 597 VP9_STORE_2X 0, 0, 6, 7, 4 598 lea dstq, [dstq+2*strideq] 599%endrep 600 VP9_STORE_2X 0, 0, 6, 7, 4 601 RET 602 603%if cpuflag(ssse3) 604; faster path for when only left corner is set (3 input: DC, right to DC, below 605; to DC). Note: also working with a 2x2 block 606.idcttopleftcorner: 607 movd m0, [blockq+0] 608 movd m1, [blockq+16] 609%if ARCH_X86_64 610 mova m10, [pw_3196x2] 611 mova m11, [pw_16069x2] 612%define W_3196x2_REG m10 613%define W_16069x2_REG m11 614%else 615%define W_3196x2_REG [pw_3196x2] 616%define W_16069x2_REG [pw_16069x2] 617%endif 618 VP9_IDCT8_2x2_1D 1 619 ; partial 2x8 transpose 620 ; punpcklwd m0, m1 already done inside idct 621 punpcklwd m2, m3 622 punpcklwd m4, m5 623 punpcklwd m6, m7 624 punpckldq m0, m2 625 punpckldq m4, m6 626 SBUTTERFLY qdq, 0, 4, 1 627 SWAP 1, 4 628 VP9_IDCT8_2x2_1D 2 629%if ARCH_X86_64 630 SWAP 6, 8 631%endif 632 pxor m6, m6 ; used for the block reset, and VP9_STORE_2X 633 VP9_IDCT8_WRITEOUT 634%if ARCH_X86_64 635 movd [blockq+ 0], m6 636 movd [blockq+16], m6 637%else 638 mova [blockq+ 0], m6 639 mova [blockq+16], m6 640 mova [blockq+32], m6 641%endif 642 RET 643 644.idcthalf: 645 movh m0, [blockq + 0] 646 movh m1, [blockq +16] 647 movh m2, [blockq +32] 648 movh m3, [blockq +48] 649 VP9_IDCT8_4x4_1D 650 ; partial 4x8 transpose 651%if ARCH_X86_32 652 mova m6, [blockq+ 0] 653%endif 654 punpcklwd m0, m1 655 punpcklwd m2, m3 656 punpcklwd m4, m5 657 punpcklwd m6, m7 658 SBUTTERFLY dq, 0, 2, 1 659 SBUTTERFLY dq, 4, 6, 5 660 SBUTTERFLY qdq, 0, 4, 1 661 SBUTTERFLY qdq, 2, 6, 5 662 SWAP 1, 4 663 SWAP 3, 6 664 VP9_IDCT8_4x4_1D 665%if ARCH_X86_64 666 SWAP 6, 8 667%endif 668 pxor m6, m6 669 VP9_IDCT8_WRITEOUT 670%if ARCH_X86_64 671 movh [blockq+ 0], m6 672 movh [blockq+16], m6 673 movh [blockq+32], m6 674%else 675 mova [blockq+ 0], m6 676 mova [blockq+16], m6 677 mova [blockq+32], m6 678%endif 679 movh [blockq+48], m6 680 RET 681%endif 682 683.idctfull: ; generic full 8x8 idct/idct 684%if ARCH_X86_64 685 mova m0, [blockq+ 0] ; IN(0) 686%endif 687 mova m1, [blockq+ 16] ; IN(1) 688 mova m2, [blockq+ 32] ; IN(2) 689 mova m3, [blockq+ 48] ; IN(3) 690%if ARCH_X86_64 691 mova m4, [blockq+ 64] 
; IN(4) 692%endif 693 mova m5, [blockq+ 80] ; IN(5) 694 mova m6, [blockq+ 96] ; IN(6) 695 mova m7, [blockq+112] ; IN(7) 696%if ARCH_X86_64 697 mova m11, [pd_8192] ; rounding 698%define D_8192_REG m11 699%else 700%define D_8192_REG [pd_8192] 701%endif 702 VP9_IDCT8_1D 703%if ARCH_X86_64 704 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 705%else 706 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 707 mova [blockq+0], m0 708%endif 709 VP9_IDCT8_1D 710 711%if ARCH_X86_64 712 SWAP 6, 8 713%endif 714 pxor m6, m6 ; used for the block reset, and VP9_STORE_2X 715 VP9_IDCT8_WRITEOUT 716 ZERO_BLOCK blockq, 16, 8, m6 717 RET 718%undef W_11585x2_REG 719%endmacro 720 721VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12 722VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13 723VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13 724 725;--------------------------------------------------------------------------------------------- 726; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 727;--------------------------------------------------------------------------------------------- 728 729; x86-32: 730; - in: m0/3/4/7 are in mem [blockq+N*16] 731; - out: m6 is in mem [blockq+0] 732; x86-64: 733; - everything is in registers 734%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7 735%if ARCH_X86_64 736 SWAP 0, 8 737 SWAP 3, 9 738 SWAP 4, 10 739 SWAP 7, 11 740%endif 741 742 VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d] 743 VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d] 744 SCRATCH 4, 12, blockq+1*16 745 VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w] 746 UNSCRATCH 4, 12, blockq+1*16 747 VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w] 748 749 UNSCRATCH 0, 8, blockq+16*0 750 UNSCRATCH 3, 9, blockq+16*3 751 UNSCRATCH 4, 10, blockq+16*4 752 UNSCRATCH 7, 11, blockq+16*7 753 SCRATCH 1, 8, blockq+16*1 754 SCRATCH 2, 9, blockq+16*2 755 SCRATCH 5, 10, blockq+16*5 756 SCRATCH 6, 11, blockq+16*6 757 758 VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d] 759 VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d] 760 SCRATCH 1, 12, blockq+ 0*16 761 VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w] 762 UNSCRATCH 1, 12, blockq+ 0*16 763 VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w] 764 765 UNSCRATCH 2, 9, blockq+16*2 766 UNSCRATCH 5, 10, blockq+16*5 767 SCRATCH 3, 9, blockq+16*3 768 SCRATCH 4, 10, blockq+16*4 769 770 ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7 771 772 VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d] 773 VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d] 774 SCRATCH 1, 12, blockq+ 0*16 775 VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG 776 UNSCRATCH 1, 12, blockq+ 0*16 777 PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w] 778 VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w] 779 780 UNSCRATCH 1, 8, blockq+16*1 781 UNSCRATCH 3, 9, blockq+16*3 782 UNSCRATCH 4, 10, blockq+16*4 783 UNSCRATCH 6, 11, blockq+16*6 784 SCRATCH 2, 8, blockq+16*0 785 786 SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w] 787 SUMSUB_BA w, 1, 3, 2 788 PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w] 789 790 ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7 791 792 ; unfortunately, the code below overflows in some cases 793%if 0; cpuflag(ssse3) 794 SUMSUB_BA w, 3, 4, 2 795 SUMSUB_BA w, 0, 7, 2 796 pmulhrsw m3, W_11585x2_REG 797 pmulhrsw m7, W_11585x2_REG 798 pmulhrsw m4, W_11585x2_REG ; 
out4 799 pmulhrsw m0, W_11585x2_REG ; out2 800%else 801 SCRATCH 5, 9, blockq+16*1 802 VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5 803 VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5 804 UNSCRATCH 5, 9, blockq+16*1 805%endif 806 PSIGNW m3, W_M1_REG ; out3 807 PSIGNW m7, W_M1_REG ; out5 808 809 ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7 810 811%if ARCH_X86_64 812 SWAP 2, 8 813%endif 814 SWAP 0, 6, 2 815 SWAP 7, 1, 5 816%endmacro 817 818%macro IADST8_FN 6 819INIT_XMM %5 820cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob 821 822%ifidn %1, idct 823%define first_is_idct 1 824%else 825%define first_is_idct 0 826%endif 827 828%ifidn %3, idct 829%define second_is_idct 1 830%else 831%define second_is_idct 0 832%endif 833 834%if ARCH_X86_64 835 mova m0, [blockq+ 0] ; IN(0) 836%endif 837 mova m1, [blockq+ 16] ; IN(1) 838 mova m2, [blockq+ 32] ; IN(2) 839%if ARCH_X86_64 || first_is_idct 840 mova m3, [blockq+ 48] ; IN(3) 841%endif 842%if ARCH_X86_64 843 mova m4, [blockq+ 64] ; IN(4) 844%endif 845 mova m5, [blockq+ 80] ; IN(5) 846 mova m6, [blockq+ 96] ; IN(6) 847%if ARCH_X86_64 || first_is_idct 848 mova m7, [blockq+112] ; IN(7) 849%endif 850%if ARCH_X86_64 851%if cpuflag(ssse3) 852 mova m15, [pw_11585x2] ; often used 853%endif 854 mova m13, [pd_8192] ; rounding 855 mova m14, [pw_m1] 856%define W_11585x2_REG m15 857%define D_8192_REG m13 858%define W_M1_REG m14 859%else 860%define W_11585x2_REG [pw_11585x2] 861%define D_8192_REG [pd_8192] 862%define W_M1_REG [pw_m1] 863%endif 864 865 ; note different calling conventions for idct8 vs. iadst8 on x86-32 866 VP9_%2_1D 867%if ARCH_X86_64 868 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 869%else 870 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 871 mova [blockq+ 0], m0 872%if second_is_idct == 0 873 mova [blockq+ 48], m3 874 mova [blockq+112], m7 875%endif 876%endif 877 VP9_%4_1D 878 879%if ARCH_X86_64 880 SWAP 6, 8 881%endif 882 pxor m6, m6 ; used for the block reset, and VP9_STORE_2X 883 VP9_IDCT8_WRITEOUT 884 ZERO_BLOCK blockq, 16, 8, m6 885 RET 886 887%undef W_11585x2_REG 888%undef first_is_idct 889%undef second_is_idct 890 891%endmacro 892 893IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15 894IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15 895IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15 896IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16 897IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16 898IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16 899IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16 900IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16 901IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 902 903;--------------------------------------------------------------------------------------------- 904; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 905;--------------------------------------------------------------------------------------------- 906 907; x86-64: 908; at the end of this macro, m7 is stored in [%4+15*%5] 909; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15 910; the following sumsubs have not been done yet: 911; SUMSUB_BA w, 6, 9, 15 ; t6, t9 912; SUMSUB_BA w, 7, 8, 15 ; t7, t8 913; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1, 914; and the following simsubs have not been done yet: 915; SUMSUB_BA w, x13, x14, 7 ; t6, t9 916; SUMSUB_BA w, x15, x12, 7 ; t7, t8 917 918%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst 919%if %2 <= 4 920 mova m3, [%1+ 1*%3] ; IN(1) 
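    ; With at most 4 nonzero coefficients, only IN(1) and IN(3) feed the odd
    ; half (the remaining odd inputs are zero), so each butterfly collapses
    ; into a single pmulhrsw with a doubled constant.  A rough scalar model
    ; of the pw_*x2 multiplies below (an illustrative sketch, not part of
    ; the build):
    ;
    ;     t8  = (in1 *  1606 + 8192) >> 14;   /* pmulhrsw(in1, [pw_1606x2])  */
    ;     t15 = (in1 * 16305 + 8192) >> 14;   /* pmulhrsw(in1, [pw_16305x2]) */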
921 mova m0, [%1+ 3*%3] ; IN(3) 922 923 pmulhrsw m4, m3, [pw_16305x2] ; t14-15 924 pmulhrsw m3, [pw_1606x2] ; t8-9 925 pmulhrsw m7, m0, [pw_m4756x2] ; t10-11 926 pmulhrsw m0, [pw_15679x2] ; t12-13 927 928 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 929 ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 930 931 VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14 932 SCRATCH 4, 10, %4+ 1*%5 933 SCRATCH 5, 11, %4+ 7*%5 934 VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 935 UNSCRATCH 5, 11, %4+ 7*%5 936 937 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 938 ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 939%else 940 mova m5, [%1+ 1*%3] ; IN(1) 941 mova m4, [%1+ 7*%3] ; IN(7) 942%if %2 <= 8 943 pmulhrsw m2, m5, [pw_16305x2] ; t15 944 pmulhrsw m5, [pw_1606x2] ; t8 945 pmulhrsw m3, m4, [pw_m10394x2] ; t9 946 pmulhrsw m4, [pw_12665x2] ; t14 947%else 948 mova m3, [%1+ 9*%3] ; IN(9) 949 mova m2, [%1+15*%3] ; IN(15) 950 951 ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7 952 ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15 953 954 VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15 955 VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14 956%endif 957 958 SUMSUB_BA w, 3, 5, 0 ; t8, t9 959 SUMSUB_BA w, 4, 2, 0 ; t15, t14 960 961 VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14 962 963 SCRATCH 4, 10, %4+ 1*%5 964 SCRATCH 5, 11, %4+ 7*%5 965 966 mova m6, [%1+ 3*%3] ; IN(3) 967 mova m7, [%1+ 5*%3] ; IN(5) 968%if %2 <= 8 969 pmulhrsw m0, m7, [pw_14449x2] ; t13 970 pmulhrsw m7, [pw_7723x2] ; t10 971 pmulhrsw m1, m6, [pw_m4756x2] ; t11 972 pmulhrsw m6, [pw_15679x2] ; t12 973%else 974 mova m0, [%1+11*%3] ; IN(11) 975 mova m1, [%1+13*%3] ; IN(13) 976 977 VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13 978 VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12 979%endif 980 981 ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7 982 ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15 983 984 SUMSUB_BA w, 7, 1, 4 ; t11, t10 985 SUMSUB_BA w, 0, 6, 4 ; t12, t13 986 987 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 988 ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 989 990 VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 991 992 UNSCRATCH 5, 11, %4+ 7*%5 993%endif 994 995 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 996 ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 997 998 SUMSUB_BA w, 7, 3, 4 ; t8, t11 999 1000 ; backup first register 1001 mova [%4+15*%5], m7 1002 1003 SUMSUB_BA w, 6, 2, 7 ; t9, t10 1004 UNSCRATCH 4, 10, %4+ 1*%5 1005 SUMSUB_BA w, 0, 4, 7 ; t15, t12 1006 SUMSUB_BA w, 1, 5, 7 ; t14. 
t13 1007 1008 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 1009 ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 1010 1011%if cpuflag(ssse3) && %6 == 0 1012 SUMSUB_BA w, 2, 5, 7 1013 SUMSUB_BA w, 3, 4, 7 1014 pmulhrsw m5, [pw_11585x2] ; t10 1015 pmulhrsw m4, [pw_11585x2] ; t11 1016 pmulhrsw m3, [pw_11585x2] ; t12 1017 pmulhrsw m2, [pw_11585x2] ; t13 1018%else 1019 SCRATCH 6, 10, %4+ 1*%5 1020 VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13 1021 VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12 1022 UNSCRATCH 6, 10, %4+ 1*%5 1023%endif 1024 1025 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 1026 ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 1027 1028 SCRATCH 0, 8, %4+ 1*%5 1029 SCRATCH 1, 9, %4+ 3*%5 1030 SCRATCH 2, 10, %4+ 5*%5 1031 SCRATCH 3, 11, %4+ 7*%5 1032 SCRATCH 4, 12, %4+ 9*%5 1033 SCRATCH 5, 13, %4+11*%5 1034 SCRATCH 6, 14, %4+13*%5 1035 1036 ; even (tx8x8) 1037%if %2 <= 4 1038 mova m3, [%1+ 0*%3] ; IN(0) 1039 mova m4, [%1+ 2*%3] ; IN(2) 1040 1041 pmulhrsw m3, [pw_11585x2] ; t0-t3 1042 pmulhrsw m7, m4, [pw_16069x2] ; t6-7 1043 pmulhrsw m4, [pw_3196x2] ; t4-5 1044 1045%if 0 ; overflows :( 1046 paddw m6, m7, m4 1047 psubw m5, m7, m4 1048 pmulhrsw m5, [pw_11585x2] ; t5 1049 pmulhrsw m6, [pw_11585x2] ; t6 1050%else 1051 VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6 1052%endif 1053 1054 psubw m0, m3, m7 1055 paddw m7, m3 1056 psubw m1, m3, m6 1057 paddw m6, m3 1058 psubw m2, m3, m5 1059 paddw m5, m3 1060 1061%if ARCH_X86_32 1062 SWAP 0, 7 1063%endif 1064 SCRATCH 7, 15, %4+12*%5 1065%else 1066 mova m6, [%1+ 2*%3] ; IN(2) 1067 mova m1, [%1+ 4*%3] ; IN(4) 1068 mova m7, [%1+ 6*%3] ; IN(6) 1069%if %2 <= 8 1070 pmulhrsw m0, m1, [pw_15137x2] ; t3 1071 pmulhrsw m1, [pw_6270x2] ; t2 1072 pmulhrsw m5, m6, [pw_16069x2] ; t7 1073 pmulhrsw m6, [pw_3196x2] ; t4 1074 pmulhrsw m4, m7, [pw_m9102x2] ; t5 1075 pmulhrsw m7, [pw_13623x2] ; t6 1076%else 1077 mova m4, [%1+10*%3] ; IN(10) 1078 mova m0, [%1+12*%3] ; IN(12) 1079 mova m5, [%1+14*%3] ; IN(14) 1080 1081 VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3 1082 VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7 1083 VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6 1084%endif 1085 1086 SUMSUB_BA w, 4, 6, 2 ; t4, t5 1087 SUMSUB_BA w, 7, 5, 2 ; t7, t6 1088 1089%if cpuflag(ssse3) && %6 == 0 1090 SUMSUB_BA w, 6, 5, 2 1091 pmulhrsw m5, [pw_11585x2] ; t5 1092 pmulhrsw m6, [pw_11585x2] ; t6 1093%else 1094 VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6 1095%endif 1096 1097 SCRATCH 5, 15, %4+10*%5 1098 mova m2, [%1+ 0*%3] ; IN(0) 1099%if %2 <= 8 1100 pmulhrsw m2, [pw_11585x2] ; t0 and t1 1101 psubw m3, m2, m0 1102 paddw m0, m2 1103 1104 SUMSUB_BA w, 7, 0, 5 ; t0, t7 1105%else 1106 mova m3, [%1+ 8*%3] ; IN(8) 1107 1108 ; from 3 stages back 1109%if cpuflag(ssse3) && %6 == 0 1110 SUMSUB_BA w, 3, 2, 5 1111 pmulhrsw m3, [pw_11585x2] ; t0 1112 pmulhrsw m2, [pw_11585x2] ; t1 1113%else 1114 mova [%1+ 0*%3], m0 1115 VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1 1116 mova m0, [%1+ 0*%3] 1117%endif 1118 1119 ; from 2 stages back 1120 SUMSUB_BA w, 0, 3, 5 ; t0, t3 1121 1122 SUMSUB_BA w, 7, 0, 5 ; t0, t7 1123%endif 1124 UNSCRATCH 5, 15, %4+10*%5 1125%if ARCH_X86_32 1126 SWAP 0, 7 1127%endif 1128 SCRATCH 7, 15, %4+12*%5 1129 SUMSUB_BA w, 1, 2, 7 ; t1, t2 1130 1131 ; from 1 stage back 1132 SUMSUB_BA w, 6, 1, 7 ; t1, t6 1133 SUMSUB_BA w, 5, 2, 
7 ; t2, t5 1134%endif 1135 SUMSUB_BA w, 4, 3, 7 ; t3, t4 1136 1137%if ARCH_X86_64 1138 SWAP 0, 8 1139 SWAP 1, 9 1140 SWAP 2, 10 1141 SWAP 3, 11 1142 SWAP 4, 12 1143 SWAP 5, 13 1144 SWAP 6, 14 1145 1146 SUMSUB_BA w, 0, 15, 7 ; t0, t15 1147 SUMSUB_BA w, 1, 14, 7 ; t1, t14 1148 SUMSUB_BA w, 2, 13, 7 ; t2, t13 1149 SUMSUB_BA w, 3, 12, 7 ; t3, t12 1150 SUMSUB_BA w, 4, 11, 7 ; t4, t11 1151 SUMSUB_BA w, 5, 10, 7 ; t5, t10 1152%else 1153 SWAP 1, 6 1154 SWAP 2, 5 1155 SWAP 3, 4 1156 mova [%4+14*%5], m6 1157 1158%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride 1159 mova m6, [%4+%2*%5] 1160 SUMSUB_BA w, 6, %1, 7 1161 SWAP %1, 6 1162 mova [%4+%3*%5], m6 1163%endmacro 1164 1165 %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15 1166 %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14 1167 %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13 1168 %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12 1169 %%SUMSUB_BA_STORE 4, 9, 9, %4, %5 ; t4, t11 1170 %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10 1171%endif 1172%endmacro 1173 1174%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst 1175%if %2 == 1 1176 VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4 1177 1178%if ARCH_X86_64 1179 ; backup a different register 1180 mova m7, [tmpq+15*16] 1181 mova [tmpq+ 1*16], m15 1182 1183 SUMSUB_BA w, 6, 9, 15 ; t6, t9 1184 SUMSUB_BA w, 7, 8, 15 ; t7, t8 1185 1186 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15 1187 mova [tmpq+ 0], m0 1188 mova [tmpq+ 32], m1 1189 mova [tmpq+ 64], m2 1190 mova [tmpq+ 96], m3 1191 mova [tmpq+128], m4 1192 mova [tmpq+160], m5 1193 mova [tmpq+192], m6 1194 mova [tmpq+224], m7 1195 1196 mova m15, [tmpq+ 1*16] 1197 TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 1198 mova [tmpq+ 16], m8 1199 mova [tmpq+ 48], m9 1200 mova [tmpq+ 80], m10 1201 mova [tmpq+112], m11 1202 mova [tmpq+144], m12 1203 mova [tmpq+176], m13 1204 mova [tmpq+208], m14 1205 mova [tmpq+240], m15 1206%else 1207 mova m6, [tmpq+13*16] 1208 mova m7, [tmpq+14*16] 1209 SUMSUB_BA w, 6, 7 ; t6, t9 1210 mova [tmpq+14*16], m6 1211 mova [tmpq+13*16], m7 1212 mova m7, [tmpq+15*16] 1213 mova m6, [tmpq+12*16] 1214 SUMSUB_BA w, 7, 6 ; t7, t8 1215 mova [tmpq+15*16], m6 1216 1217 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1 1218 mova [tmpq+ 0*16], m0 1219 mova [tmpq+ 2*16], m1 1220 mova [tmpq+ 4*16], m2 1221 mova [tmpq+ 6*16], m3 1222 mova [tmpq+10*16], m5 1223 mova [tmpq+12*16], m6 1224 mova [tmpq+14*16], m7 1225 1226 mova m0, [tmpq+15*16] 1227 mova m1, [tmpq+13*16] 1228 mova m2, [tmpq+11*16] 1229 mova m3, [tmpq+ 9*16] 1230 mova m4, [tmpq+ 7*16] 1231 mova m5, [tmpq+ 5*16] 1232 mova m7, [tmpq+ 1*16] 1233 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1 1234 mova [tmpq+ 1*16], m0 1235 mova [tmpq+ 3*16], m1 1236 mova [tmpq+ 5*16], m2 1237 mova [tmpq+ 7*16], m3 1238 mova [tmpq+11*16], m5 1239 mova [tmpq+13*16], m6 1240 mova [tmpq+15*16], m7 1241%endif 1242%else ; %2 == 2 1243 VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4 1244 1245%if cpuflag(ssse3) 1246%define ROUND_REG [pw_512] 1247%else 1248%define ROUND_REG [pw_32] 1249%endif 1250 1251 pxor m7, m7 1252%if ARCH_X86_64 1253 ; backup more registers 1254 mova [%1+ 2*32], m8 1255 mova [%1+ 3*32], m9 1256 1257 VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6 1258 lea dstq, [dstq+strideq*2] 1259 VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6 1260 lea dstq, [dstq+strideq*2] 1261 VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6 1262 lea dstq, [dstq+strideq*2] 1263 1264 ; restore from cache 1265 SWAP 0, 7 ; move zero from m7 to m0 1266 mova m7, [%1+15*32] 1267 mova m8, [%1+ 2*32] 
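    ; The registers spilled to the coefficient buffer before the first three
    ; row-pair stores are reloaded around this point for the remaining rows.
    ; The pass-2 writeout performed by each VP9_IDCT8_WRITEx2 amounts to, as
    ; a rough C model (an illustrative sketch, not part of the build):
    ;
    ;     dst[x] = av_clip_uint8(dst[x] + ((row[x] + 32) >> 6));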
1268 mova m9, [%1+ 3*32] 1269 1270 SUMSUB_BA w, 6, 9, 3 ; t6, t9 1271 SUMSUB_BA w, 7, 8, 3 ; t7, t8 1272 1273 VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6 1274 lea dstq, [dstq+strideq*2] 1275 VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6 1276 lea dstq, [dstq+strideq*2] 1277 VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6 1278 lea dstq, [dstq+strideq*2] 1279 VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6 1280 lea dstq, [dstq+strideq*2] 1281 VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6 1282%else 1283 mova [tmpq+ 0*32], m5 1284 1285 VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6 1286 lea dstq, [dstq+strideq*2] 1287 VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6 1288 lea dstq, [dstq+strideq*2] 1289 1290 SWAP 0, 7 ; move zero from m7 to m0 1291 mova m5, [tmpq+ 0*32] 1292 1293 VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 1294 lea dstq, [dstq+strideq*2] 1295 1296 mova m4, [tmpq+13*32] 1297 mova m7, [tmpq+14*32] 1298 mova m5, [tmpq+15*32] 1299 mova m6, [tmpq+12*32] 1300 SUMSUB_BADC w, 4, 7, 5, 6, 1 1301 1302 VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 1303 lea dstq, [dstq+strideq*2] 1304 VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 1305 lea dstq, [dstq+strideq*2] 1306 1307 mova m4, [tmpq+11*32] 1308 mova m5, [tmpq+ 9*32] 1309 mova m6, [tmpq+ 7*32] 1310 mova m7, [tmpq+ 5*32] 1311 1312 VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 1313 lea dstq, [dstq+strideq*2] 1314 VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 1315 lea dstq, [dstq+strideq*2] 1316 1317 mova m4, [tmpq+ 3*32] 1318 mova m5, [tmpq+ 1*32] 1319 1320 VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 1321 lea dstq, [dstq+strideq*2] 1322%endif 1323 1324%undef ROUND_REG 1325%endif ; %2 == 1/2 1326%endmacro 1327 1328%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride 1329 mova m%3, [dstq] 1330 mova m%5, [dstq+%7] 1331 punpcklbw m%2, m%3, m%6 1332 punpckhbw m%3, m%6 1333 punpcklbw m%4, m%5, m%6 1334 punpckhbw m%5, m%6 1335 paddw m%2, m%1 1336 paddw m%3, m%1 1337 paddw m%4, m%1 1338 paddw m%5, m%1 1339 packuswb m%2, m%3 1340 packuswb m%4, m%5 1341 mova [dstq], m%2 1342 mova [dstq+%7], m%4 1343%endmacro 1344 1345%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 1346INIT_XMM %1 1347cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob 1348%if cpuflag(ssse3) 1349 ; 2x2=eob=3, 4x4=eob=10 1350 cmp eobd, 38 1351 jg .idctfull 1352 cmp eobd, 1 ; faster path for when only DC is set 1353 jne .idct8x8 1354%else 1355 cmp eobd, 1 ; faster path for when only DC is set 1356 jg .idctfull 1357%endif 1358 1359 ; dc-only 1360%if cpuflag(ssse3) 1361 movd m0, [blockq] 1362 mova m1, [pw_11585x2] 1363 pmulhrsw m0, m1 1364 pmulhrsw m0, m1 1365%else 1366 DEFINE_ARGS dst, stride, block, coef 1367 movsx coefd, word [blockq] 1368 imul coefd, 11585 1369 add coefd, 8192 1370 sar coefd, 14 1371 imul coefd, 11585 1372 add coefd, (32 << 14) + 8192 1373 sar coefd, 14 + 6 1374 movd m0, coefd 1375%endif 1376 SPLATW m0, m0, q0000 1377%if cpuflag(ssse3) 1378 pmulhrsw m0, [pw_512] 1379%endif 1380 pxor m5, m5 1381 movd [blockq], m5 1382%rep 7 1383 VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 1384 lea dstq, [dstq+2*strideq] 1385%endrep 1386 VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 1387 RET 1388 1389 DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp 1390%if cpuflag(ssse3) 1391.idct8x8: 1392 mov tmpq, rsp 1393 VP9_IDCT16_1D blockq, 1, 8, 0 1394 1395 mov cntd, 2 1396 mov dst_bakq, dstq 1397.loop2_8x8: 1398 VP9_IDCT16_1D tmpq, 2, 8, 0 1399 lea dstq, [dst_bakq+8] 1400 add tmpq, 16 1401 dec cntd 1402 jg .loop2_8x8 1403 1404 ; at the end of the loop, m0 should still be zero 
1405 ; use that to zero out block coefficients 1406 ZERO_BLOCK blockq, 32, 8, m0 1407 RET 1408%endif 1409 1410.idctfull: 1411 mov cntd, 2 1412 mov tmpq, rsp 1413.loop1_full: 1414 VP9_IDCT16_1D blockq, 1, 16, 0 1415 add blockq, 16 1416 add tmpq, 256 1417 dec cntd 1418 jg .loop1_full 1419 sub blockq, 32 1420 1421 mov cntd, 2 1422 mov tmpq, rsp 1423 mov dst_bakq, dstq 1424.loop2_full: 1425 VP9_IDCT16_1D tmpq, 2, 16, 0 1426 lea dstq, [dst_bakq+8] 1427 add tmpq, 16 1428 dec cntd 1429 jg .loop2_full 1430 1431 ; at the end of the loop, m0 should still be zero 1432 ; use that to zero out block coefficients 1433 ZERO_BLOCK blockq, 32, 16, m0 1434 RET 1435%endmacro 1436 1437VP9_IDCT_IDCT_16x16_ADD_XMM sse2 1438VP9_IDCT_IDCT_16x16_ADD_XMM ssse3 1439VP9_IDCT_IDCT_16x16_ADD_XMM avx 1440 1441%macro VP9_IDCT16_YMM_1D 0 1442 VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15 1443 VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14 1444 1445 SUMSUB_BA w, 9, 1, 0 ; t8, t9 1446 SUMSUB_BA w, 7, 15, 0 ; t15, t14 1447 1448 VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14 1449 1450 VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13 1451 VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12 1452 1453 SUMSUB_BA w, 5, 13, 0 ; t11, t10 1454 SUMSUB_BA w, 11, 3, 0 ; t12, t13 1455 1456 VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13 1457 1458 SUMSUB_BA w, 5, 9, 0 ; t8, t11 1459 SUMSUB_BA w, 3, 15, 0 ; t9, t10 1460 SUMSUB_BA w, 11, 7, 0 ; t15, t12 1461 SUMSUB_BA w, 13, 1, 0 ; t14, t13 1462 1463 SUMSUB_BA w, 15, 1, 0 1464 SUMSUB_BA w, 9, 7, 0 1465 pmulhrsw m1, [pw_11585x2] ; t10 1466 pmulhrsw m7, [pw_11585x2] ; t11 1467 pmulhrsw m9, [pw_11585x2] ; t12 1468 pmulhrsw m15, [pw_11585x2] ; t13 1469 1470 ; even (tx8x8) 1471 mova m4, [blockq+128] 1472 mova [blockq+128], m5 1473 VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3 1474 VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7 1475 VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6 1476 mova m0, [blockq+ 0] 1477 SUMSUB_BA w, 8, 0, 5 1478 pmulhrsw m8, [pw_11585x2] ; t0 1479 pmulhrsw m0, [pw_11585x2] ; t1 1480 1481 SUMSUB_BA w, 10, 2, 5 ; t4, t5 1482 SUMSUB_BA w, 6, 14, 5 ; t7, t6 1483 SUMSUB_BA w, 12, 8, 5 ; t0, t3 1484 SUMSUB_BA w, 4, 0, 5 ; t1, t2 1485 1486 SUMSUB_BA w, 2, 14, 5 1487 pmulhrsw m14, [pw_11585x2] ; t5 1488 pmulhrsw m2, [pw_11585x2] ; t6 1489 1490 SUMSUB_BA w, 6, 12, 5 ; t0, t7 1491 SUMSUB_BA w, 2, 4, 5 ; t1, t6 1492 SUMSUB_BA w, 14, 0, 5 ; t2, t5 1493 SUMSUB_BA w, 10, 8, 5 ; t3, t4 1494 1495 ; final stage 1496 SUMSUB_BA w, 11, 6, 5 ; out0, out15 1497 SUMSUB_BA w, 13, 2, 5 ; out1, out14 1498 SUMSUB_BA w, 15, 14, 5 ; out2, out13 1499 SUMSUB_BA w, 9, 10, 5 ; out3, out12 1500 SUMSUB_BA w, 7, 8, 5 ; out4, out11 1501 SUMSUB_BA w, 1, 0, 5 ; out5, out10 1502 SUMSUB_BA w, 3, 4, 5 ; out6, out9 1503 mova m5, [blockq+128] 1504 mova [blockq+192], m3 1505 SUMSUB_BA w, 5, 12, 3 ; out7, out8 1506 1507 SWAP 0, 11, 8, 12, 10 1508 SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5 1509%endmacro 1510 1511; this is almost identical to VP9_STORE_2X, but it does two rows 1512; for slightly improved interleaving, and it omits vpermq since the 1513; input is DC so all values are identical 1514%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero 1515 mova xm%2, [dstq] 1516 mova xm%4, [dstq+strideq*2] 1517 vinserti128 m%2, m%2, [dstq+strideq], 1 1518 vinserti128 m%4, m%4, [dstq+stride3q], 1 1519 punpckhbw m%3, 
m%2, m%6 1520 punpcklbw m%2, m%6 1521 punpckhbw m%5, m%4, m%6 1522 punpcklbw m%4, m%6 1523 paddw m%3, m%1 1524 paddw m%2, m%1 1525 paddw m%5, m%1 1526 paddw m%4, m%1 1527 packuswb m%2, m%3 1528 packuswb m%4, m%5 1529 mova [dstq], xm%2 1530 mova [dstq+strideq*2], xm%4 1531 vextracti128 [dstq+strideq], m%2, 1 1532 vextracti128 [dstq+stride3q], m%4, 1 1533%endmacro 1534 1535%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 1536INIT_YMM avx2 1537cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob 1538 cmp eobd, 1 ; faster path for when only DC is set 1539 jg .idctfull 1540 1541 ; dc-only 1542 mova m1, [pw_11585x2] 1543 vpbroadcastw m0, [blockq] 1544 pmulhrsw m0, m1 1545 pmulhrsw m0, m1 1546 pxor m5, m5 1547 pmulhrsw m0, [pw_512] 1548 movd [blockq], xm5 1549 1550 DEFINE_ARGS dst, stride, stride3, cnt 1551 mov cntd, 4 1552 lea stride3q, [strideq*3] 1553.loop_dc: 1554 VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5 1555 lea dstq, [dstq+4*strideq] 1556 dec cntd 1557 jg .loop_dc 1558 RET 1559 1560 DEFINE_ARGS dst, stride, block, eob 1561.idctfull: 1562 mova m1, [blockq+ 32] 1563 mova m2, [blockq+ 64] 1564 mova m3, [blockq+ 96] 1565 mova m5, [blockq+160] 1566 mova m6, [blockq+192] 1567 mova m7, [blockq+224] 1568 mova m8, [blockq+256] 1569 mova m9, [blockq+288] 1570 mova m10, [blockq+320] 1571 mova m11, [blockq+352] 1572 mova m12, [blockq+384] 1573 mova m13, [blockq+416] 1574 mova m14, [blockq+448] 1575 mova m15, [blockq+480] 1576 1577 VP9_IDCT16_YMM_1D 1578 TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 1579 [blockq+192], [blockq+128], 1 1580 mova [blockq+ 0], m0 1581 VP9_IDCT16_YMM_1D 1582 1583 mova [blockq+224], m7 1584 1585 ; store 1586 VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 1587 lea dstq, [dstq+2*strideq] 1588 VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 1589 lea dstq, [dstq+2*strideq] 1590 VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 1591 lea dstq, [dstq+2*strideq] 1592 mova m6, [blockq+192] 1593 mova m7, [blockq+224] 1594 VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 1595 lea dstq, [dstq+2*strideq] 1596 VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 1597 lea dstq, [dstq+2*strideq] 1598 VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 1599 lea dstq, [dstq+2*strideq] 1600 VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 1601 lea dstq, [dstq+2*strideq] 1602 VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 1603 lea dstq, [dstq+2*strideq] 1604 1605 ; at the end of the loop, m0 should still be zero 1606 ; use that to zero out block coefficients 1607 pxor m0, m0 1608 ZERO_BLOCK blockq, 32, 16, m0 1609 RET 1610%endif 1611 1612;--------------------------------------------------------------------------------------------- 1613; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 1614;--------------------------------------------------------------------------------------------- 1615 1616%macro VP9_IADST16_1D 2 ; src, pass 1617%assign %%str 16*%2 1618 mova m0, [%1+ 0*32] ; in0 1619 mova m1, [%1+15*32] ; in15 1620 mova m2, [%1+ 7*32] ; in7 1621 mova m3, [%1+ 8*32] ; in8 1622 1623 VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d] 1624 VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d] 1625 SCRATCH 4, 8, tmpq+ 0*%%str 1626 VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w] 1627 UNSCRATCH 4, 8, tmpq+ 0*%%str 1628 VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w] 1629 1630 SCRATCH 0, 10, tmpq+ 0*%%str 1631 SCRATCH 1, 11, tmpq+15*%%str 1632 
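    ; Each VP9_UNPACK_MULSUB_2D_4X / VP9_RND_SH_SUMSUB_BA pair above is one
    ; iadst16 butterfly kept in 32-bit precision until the final rounding.
    ; A rough scalar model of the t0/t8 pair just computed (an illustrative
    ; sketch, not part of the build):
    ;
    ;     s0 = in15 * 16364 + in0 *   804;
    ;     s8 = in7  * 11003 + in8 * 12140;
    ;     t0 = (s0 + s8 + 8192) >> 14;
    ;     t8 = (s0 - s8 + 8192) >> 14;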
mova [tmpq+ 7*%%str], m2 1633 mova [tmpq+ 8*%%str], m3 1634 1635 mova m1, [%1+ 2*32] ; in2 1636 mova m0, [%1+13*32] ; in13 1637 mova m3, [%1+ 5*32] ; in5 1638 mova m2, [%1+10*32] ; in10 1639 1640 VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d] 1641 VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d] 1642 SCRATCH 4, 12, tmpq+ 2*%%str 1643 VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w] 1644 UNSCRATCH 4, 12, tmpq+ 2*%%str 1645 VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w] 1646 1647 SCRATCH 0, 12, tmpq+ 2*%%str 1648 SCRATCH 1, 13, tmpq+13*%%str 1649 mova [tmpq+ 5*%%str], m2 1650 mova [tmpq+10*%%str], m3 1651 1652 mova m2, [%1+ 4*32] ; in4 1653 mova m3, [%1+11*32] ; in11 1654 mova m0, [%1+ 3*32] ; in3 1655 mova m1, [%1+12*32] ; in12 1656 1657 VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d] 1658 VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d] 1659 SCRATCH 4, 9, tmpq+ 4*%%str 1660 VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w] 1661 UNSCRATCH 4, 9, tmpq+ 4*%%str 1662 VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w] 1663 1664 SCRATCH 0, 8, tmpq+ 4*%%str 1665 mova [tmpq+11*%%str], m1 ; t4:m1->r11 1666 UNSCRATCH 0, 10, tmpq+ 0*%%str 1667 UNSCRATCH 1, 11, tmpq+15*%%str 1668 1669 ; round 2 interleaved part 1 1670 VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d] 1671 VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d] 1672 SCRATCH 4, 9, tmpq+ 3*%%str 1673 VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w] 1674 UNSCRATCH 4, 9, tmpq+ 3*%%str 1675 VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w] 1676 1677 SCRATCH 0, 10, tmpq+ 0*%%str 1678 SCRATCH 1, 11, tmpq+15*%%str 1679 SCRATCH 2, 14, tmpq+ 3*%%str 1680 SCRATCH 3, 15, tmpq+12*%%str 1681 1682 mova m2, [%1+ 6*32] ; in6 1683 mova m3, [%1+ 9*32] ; in9 1684 mova m0, [%1+ 1*32] ; in1 1685 mova m1, [%1+14*32] ; in14 1686 1687 VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d] 1688 VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d] 1689 SCRATCH 4, 9, tmpq+ 6*%%str 1690 VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w] 1691 UNSCRATCH 4, 9, tmpq+ 6*%%str 1692 VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w] 1693 1694 ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7 1695 ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15 1696 1697 UNSCRATCH 4, 12, tmpq+ 2*%%str 1698 UNSCRATCH 5, 13, tmpq+13*%%str 1699 SCRATCH 0, 12, tmpq+ 1*%%str 1700 SCRATCH 1, 13, tmpq+14*%%str 1701 1702 ; remainder of round 2 (rest of t8-15) 1703 VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d] 1704 VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d] 1705 SCRATCH 0, 9, tmpq+ 6*%%str 1706 VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w] 1707 UNSCRATCH 0, 9, tmpq+ 6*%%str 1708 VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w] 1709 1710 ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15 1711 1712 UNSCRATCH 6, 14, tmpq+ 3*%%str 1713 UNSCRATCH 7, 15, tmpq+12*%%str 1714 1715 SUMSUB_BA w, 3, 7, 1 1716 PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w] 1717 SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w] 1718 1719 ; unfortunately, the code below overflows in some cases, e.g. 
1720 ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm 1721%if 0; cpuflag(ssse3) 1722 SUMSUB_BA w, 7, 6, 1 1723 pmulhrsw m7, [pw_11585x2] ; m7=out6[w] 1724 pmulhrsw m6, [pw_11585x2] ; m6=out9[w] 1725%else 1726 VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0 1727%endif 1728 1729 mova [tmpq+ 3*%%str], m6 1730 mova [tmpq+ 6*%%str], m7 1731 UNSCRATCH 6, 10, tmpq+ 0*%%str 1732 UNSCRATCH 7, 11, tmpq+15*%%str 1733 mova [tmpq+13*%%str], m2 1734 SCRATCH 3, 11, tmpq+ 9*%%str 1735 1736 VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d] 1737 VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d] 1738 SCRATCH 0, 9, tmpq+ 2*%%str 1739 VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w] 1740 UNSCRATCH 0, 9, tmpq+ 2*%%str 1741 VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192] 1742 PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w] 1743 1744 ; unfortunately, the code below overflows in some cases 1745%if 0; cpuflag(ssse3) 1746 SUMSUB_BA w, 7, 6, 1 1747 pmulhrsw m7, [pw_m11585x2] ; m7=out5[w] 1748 pmulhrsw m6, [pw_11585x2] ; m6=out10[w] 1749%else 1750 PSIGNW m7, [pw_m1] 1751 VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0 1752%endif 1753 1754 ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14 1755 1756 mova m2, [tmpq+ 8*%%str] 1757 mova m3, [tmpq+ 7*%%str] 1758 mova m1, [tmpq+11*%%str] 1759 mova [tmpq+ 7*%%str], m6 1760 mova [tmpq+11*%%str], m4 1761 mova m4, [tmpq+ 5*%%str] 1762 SCRATCH 5, 14, tmpq+ 5*%%str 1763 SCRATCH 7, 15, tmpq+ 8*%%str 1764 UNSCRATCH 6, 8, tmpq+ 4*%%str 1765 UNSCRATCH 5, 12, tmpq+ 1*%%str 1766 UNSCRATCH 7, 13, tmpq+14*%%str 1767 1768 ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7 1769 ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 1770 1771 SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w] 1772 mova m0, [tmpq+10*%%str] 1773 SCRATCH 1, 12, tmpq+ 1*%%str 1774 SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w] 1775 SCRATCH 6, 13, tmpq+ 4*%%str 1776 SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w] 1777 SCRATCH 7, 8, tmpq+10*%%str 1778 SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w] 1779 SCRATCH 5, 9, tmpq+14*%%str 1780 1781 VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d] 1782 VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d] 1783 SCRATCH 6, 10, tmpq+ 0*%%str 1784 VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192] 1785 UNSCRATCH 6, 10, tmpq+ 0*%%str 1786 PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w] 1787 VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w] 1788 1789 UNSCRATCH 1, 8, tmpq+10*%%str 1790 UNSCRATCH 5, 9, tmpq+14*%%str 1791 UNSCRATCH 6, 12, tmpq+ 1*%%str 1792 UNSCRATCH 7, 13, tmpq+ 4*%%str 1793 SCRATCH 4, 9, tmpq+14*%%str 1794 1795 SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w] 1796 SUMSUB_BA w, 5, 7, 4 1797 PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w] 1798 1799 ; unfortunately, the code below overflows in some cases, e.g. 
1800 ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm 1801%if 0 ; cpuflag(ssse3) 1802 SUMSUB_BA w, 7, 6, 4 1803 pmulhrsw m7, [pw_m11585x2] ; m8=out7[w] 1804 pmulhrsw m6, [pw_11585x2] ; m1=out8[w] 1805 SWAP 6, 7 1806 SUMSUB_BA w, 3, 2, 4 1807 pmulhrsw m3, [pw_11585x2] ; m3=out4[w] 1808 pmulhrsw m2, [pw_11585x2] ; m2=out11[w] 1809%else 1810 SCRATCH 5, 8, tmpq+10*%%str 1811 VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4 1812 VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4 1813 UNSCRATCH 5, 8, tmpq+10*%%str 1814%endif 1815 1816 ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15 1817 ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 1818 1819%if %2 == 1 1820%if ARCH_X86_64 1821 mova m13, [tmpq+ 6*%%str] 1822 TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10 1823 mova [tmpq+ 0*16], m1 1824 mova [tmpq+ 2*16], m11 1825 mova [tmpq+ 4*16], m14 1826 mova [tmpq+ 6*16], m0 1827 mova m1, [tmpq+ 3*%%str] 1828 mova m11, [tmpq+ 7*%%str] 1829 mova m14, [tmpq+11*%%str] 1830 mova m0, [tmpq+13*%%str] 1831 mova [tmpq+ 8*16], m3 1832 mova [tmpq+10*16], m15 1833 mova [tmpq+12*16], m13 1834 mova [tmpq+14*16], m6 1835 1836 TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10 1837 mova [tmpq+ 1*16], m7 1838 mova [tmpq+ 3*16], m1 1839 mova [tmpq+ 5*16], m11 1840 mova [tmpq+ 7*16], m2 1841 mova [tmpq+ 9*16], m9 1842 mova [tmpq+11*16], m14 1843 mova [tmpq+13*16], m0 1844 mova [tmpq+15*16], m5 1845%else 1846 mova [tmpq+12*%%str], m2 1847 mova [tmpq+ 1*%%str], m5 1848 mova [tmpq+15*%%str], m7 1849 mova m2, [tmpq+ 9*%%str] 1850 mova m5, [tmpq+ 5*%%str] 1851 mova m7, [tmpq+ 8*%%str] 1852 TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1 1853 mova [tmpq+ 0*16], m1 1854 mova [tmpq+ 2*16], m2 1855 mova [tmpq+ 4*16], m5 1856 mova [tmpq+ 6*16], m0 1857 mova [tmpq+10*16], m7 1858 mova m3, [tmpq+12*%%str] 1859 mova [tmpq+12*16], m4 1860 mova m4, [tmpq+14*%%str] 1861 mova [tmpq+14*16], m6 1862 1863 mova m0, [tmpq+15*%%str] 1864 mova m1, [tmpq+ 3*%%str] 1865 mova m2, [tmpq+ 7*%%str] 1866 mova m5, [tmpq+11*%%str] 1867 mova m7, [tmpq+ 1*%%str] 1868 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1 1869 mova [tmpq+ 1*16], m0 1870 mova [tmpq+ 3*16], m1 1871 mova [tmpq+ 5*16], m2 1872 mova [tmpq+ 7*16], m3 1873 mova [tmpq+11*16], m5 1874 mova [tmpq+13*16], m6 1875 mova [tmpq+15*16], m7 1876%endif 1877%else 1878 pxor m4, m4 1879 1880%if cpuflag(ssse3) 1881%define ROUND_REG [pw_512] 1882%else 1883%define ROUND_REG [pw_32] 1884%endif 1885 1886%if ARCH_X86_64 1887 mova m12, [tmpq+ 6*%%str] 1888 VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6 1889 lea dstq, [dstq+strideq*2] 1890 VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6 1891 lea dstq, [dstq+strideq*2] 1892 VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6 1893 lea dstq, [dstq+strideq*2] 1894 VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6 1895 lea dstq, [dstq+strideq*2] 1896 1897 mova m1, [tmpq+ 3*%%str] 1898 mova m11, [tmpq+ 7*%%str] 1899 mova m14, [tmpq+11*%%str] 1900 mova m0, [tmpq+13*%%str] 1901 1902 VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6 1903 lea dstq, [dstq+strideq*2] 1904 VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6 1905 lea dstq, [dstq+strideq*2] 1906 VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6 1907 lea dstq, [dstq+strideq*2] 1908 VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6 1909%else 1910 mova [tmpq+ 0*%%str], m2 1911 mova [tmpq+ 1*%%str], m5 1912 mova [tmpq+ 2*%%str], m7 1913 mova m2, [tmpq+ 
9*%%str] 1914 VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6 1915 lea dstq, [dstq+strideq*2] 1916 mova m5, [tmpq+ 5*%%str] 1917 VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6 1918 lea dstq, [dstq+strideq*2] 1919 mova m5, [tmpq+ 8*%%str] 1920 VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6 1921 lea dstq, [dstq+strideq*2] 1922 mova m5, [tmpq+ 6*%%str] 1923 VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6 1924 lea dstq, [dstq+strideq*2] 1925 1926 mova m0, [tmpq+ 2*%%str] 1927 mova m3, [tmpq+ 3*%%str] 1928 VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1929 lea dstq, [dstq+strideq*2] 1930 mova m0, [tmpq+ 7*%%str] 1931 mova m3, [tmpq+ 0*%%str] 1932 VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1933 lea dstq, [dstq+strideq*2] 1934 mova m0, [tmpq+14*%%str] 1935 mova m3, [tmpq+11*%%str] 1936 VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1937 lea dstq, [dstq+strideq*2] 1938 mova m0, [tmpq+13*%%str] 1939 mova m3, [tmpq+ 1*%%str] 1940 VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1941%endif 1942 1943 SWAP 0, 4 ; zero 1944%undef ROUND_REG 1945%endif 1946%endmacro 1947 1948%macro IADST16_FN 5 1949INIT_XMM %5 1950cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp 1951 mov cntd, 2 1952 mov tmpq, rsp 1953.loop1_full: 1954 VP9_%2_1D blockq, 1 1955 add blockq, 16 1956 add tmpq, 256 1957 dec cntd 1958 jg .loop1_full 1959 sub blockq, 32 1960 1961 mov cntd, 2 1962 mov tmpq, rsp 1963 mov dst_bakq, dstq 1964.loop2_full: 1965 VP9_%4_1D tmpq, 2 1966 lea dstq, [dst_bakq+8] 1967 add tmpq, 16 1968 dec cntd 1969 jg .loop2_full 1970 1971 ; at the end of the loop, m0 should still be zero 1972 ; use that to zero out block coefficients 1973 ZERO_BLOCK blockq, 32, 16, m0 1974 RET 1975%endmacro 1976 1977IADST16_FN idct, IDCT16, iadst, IADST16, sse2 1978IADST16_FN iadst, IADST16, idct, IDCT16, sse2 1979IADST16_FN iadst, IADST16, iadst, IADST16, sse2 1980IADST16_FN idct, IDCT16, iadst, IADST16, ssse3 1981IADST16_FN iadst, IADST16, idct, IDCT16, ssse3 1982IADST16_FN iadst, IADST16, iadst, IADST16, ssse3 1983IADST16_FN idct, IDCT16, iadst, IADST16, avx 1984IADST16_FN iadst, IADST16, idct, IDCT16, avx 1985IADST16_FN iadst, IADST16, iadst, IADST16, avx 1986 1987; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128] 1988; out: m[0-15] except m6, which is in [blockq+192] 1989; uses blockq as scratch space 1990%macro VP9_IADST16_YMM_1D 0 1991 mova [blockq+ 32], m3 1992 mova [blockq+ 64], m7 1993 mova [blockq+ 96], m8 1994 1995 ; first half of round 1 1996 VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d] 1997 VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d] 1998 VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w] 1999 VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w] 2000 2001 VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d] 2002 VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d] 2003 VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w] 2004 VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w] 2005 2006 ; half of round 2 t8-15 2007 VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d] 2008 VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d] 2009 VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w] 2010 VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w] 2011 2012 SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6 2013 SUMSUB_BA w, 1, 5, 8 ; m1=t3, 
m5=t7 2014 2015 mova m0, [blockq+ 0] 2016 mova m4, [blockq+128] 2017 mova m3, [blockq+ 32] 2018 mova m7, [blockq+ 64] 2019 mova m8, [blockq+ 96] 2020 mova [blockq+ 0], m1 2021 mova [blockq+128], m14 2022 mova [blockq+ 32], m6 2023 mova [blockq+ 64], m9 2024 mova [blockq+ 96], m10 2025 2026 ; second half of round 1 2027 VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d] 2028 VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d] 2029 VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w] 2030 VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w] 2031 2032 VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d] 2033 VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d] 2034 VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w] 2035 VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w] 2036 2037 ; second half of round 2 t8-15 2038 VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d] 2039 VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d] 2040 VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w] 2041 VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w] 2042 2043 SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4 2044 SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5 2045 2046 mova m10, [blockq+ 96] 2047 mova [blockq+ 96], m12 2048 2049 ; round 3 2050 VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d] 2051 VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d] 2052 VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w] 2053 VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192] 2054 PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w] 2055 2056 VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d] 2057 VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d] 2058 VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192] 2059 PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w] 2060 VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w] 2061 2062 mova m1, [blockq+ 0] 2063 mova m14, [blockq+128] 2064 mova m6, [blockq+ 32] 2065 mova m9, [blockq+ 64] 2066 mova m12, [blockq+ 96] 2067 mova [blockq+ 0], m10 2068 mova [blockq+128], m5 2069 2070 SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a 2071 SUMSUB_BA w, 1, 3, 5 2072 PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a 2073 2074 SUMSUB_BA w, 9, 11, 5 2075 PSIGNW m9, [pw_m1] ; m9=out1, m11=t10 2076 SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11 2077 2078 VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6 2079 mova m5, [blockq+128] 2080 mova [blockq+192], m11 2081 PSIGNW m15, [pw_m1] 2082 VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10 2083 2084 PSIGNW m3, [pw_m1] 2085 VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8 2086 VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4 2087 2088 mova m10, [blockq+ 0] 2089 2090 SWAP 0, 14, 6, 11, 8, 12, 10 2091 SWAP 1, 9, 15, 4, 7, 3, 5 2092 SWAP 5, 9, 15 2093%endmacro 2094 2095%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 2096%macro IADST16_YMM_FN 4 2097INIT_YMM avx2 2098cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob 2099 mova m1, [blockq+ 32] 2100 mova m2, [blockq+ 64] 2101 mova m3, [blockq+ 96] 2102 mova m5, [blockq+160] 2103 mova m6, [blockq+192] 2104 mova m7, [blockq+224] 2105 mova m8, [blockq+256] 2106 
mova m9, [blockq+288] 2107 mova m10, [blockq+320] 2108 mova m11, [blockq+352] 2109 mova m12, [blockq+384] 2110 mova m13, [blockq+416] 2111 mova m14, [blockq+448] 2112 mova m15, [blockq+480] 2113 2114 VP9_%2_YMM_1D 2115 TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 2116 [blockq+192], [blockq+128], 1 2117 mova [blockq+ 0], m0 2118 VP9_%4_YMM_1D 2119 2120 mova [blockq+224], m7 2121 2122 ; store 2123 VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 2124 lea dstq, [dstq+2*strideq] 2125 VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 2126 lea dstq, [dstq+2*strideq] 2127 VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 2128 lea dstq, [dstq+2*strideq] 2129 mova m6, [blockq+192] 2130 mova m7, [blockq+224] 2131 VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 2132 lea dstq, [dstq+2*strideq] 2133 VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 2134 lea dstq, [dstq+2*strideq] 2135 VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 2136 lea dstq, [dstq+2*strideq] 2137 VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 2138 lea dstq, [dstq+2*strideq] 2139 VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 2140 lea dstq, [dstq+2*strideq] 2141 2142 ; at the end of the loop, m0 should still be zero 2143 ; use that to zero out block coefficients 2144 pxor m0, m0 2145 ZERO_BLOCK blockq, 32, 16, m0 2146 RET 2147%endmacro 2148 2149IADST16_YMM_FN idct, IDCT16, iadst, IADST16 2150IADST16_YMM_FN iadst, IADST16, idct, IDCT16 2151IADST16_YMM_FN iadst, IADST16, iadst, IADST16 2152%endif 2153 2154;--------------------------------------------------------------------------------------------- 2155; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 2156;--------------------------------------------------------------------------------------------- 2157 2158%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc 2159%if %2 == 1 2160%assign %%str mmsize 2161%else 2162%assign %%str 64 2163%endif 2164 2165 ; first do t0-15, this can be done identical to idct16x16 2166 VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1 2167 2168 ; store everything on stack to make space available for t16-31 2169 ; we store interleaved with the output of the second half (t16-31) 2170 ; so we don't need to allocate extra stack space 2171 mova [tmpq+ 0*%%str], m0 ; t0 2172 mova [tmpq+ 4*%%str], m1 ; t1 2173 mova [tmpq+ 8*%%str], m2 ; t2 2174 mova [tmpq+12*%%str], m3 ; t3 2175 mova [tmpq+16*%%str], m4 ; t4 2176 mova [tmpq+20*%%str], m5 ; t5 2177%if ARCH_X86_64 2178 mova [tmpq+22*%%str], m10 ; t10 2179 mova [tmpq+18*%%str], m11 ; t11 2180 mova [tmpq+14*%%str], m12 ; t12 2181 mova [tmpq+10*%%str], m13 ; t13 2182 mova [tmpq+ 6*%%str], m14 ; t14 2183 mova [tmpq+ 2*%%str], m15 ; t15 2184%endif 2185 2186 mova m0, [tmpq+ 30*%%str] 2187 UNSCRATCH 1, 6, tmpq+26*%%str 2188 UNSCRATCH 2, 8, tmpq+24*%%str 2189 UNSCRATCH 3, 9, tmpq+28*%%str 2190 SUMSUB_BA w, 1, 3, 4 ; t6, t9 2191 SUMSUB_BA w, 0, 2, 4 ; t7, t8 2192 2193 mova [tmpq+24*%%str], m1 ; t6 2194 mova [tmpq+28*%%str], m0 ; t7 2195 mova [tmpq+30*%%str], m2 ; t8 2196 mova [tmpq+26*%%str], m3 ; t9 2197 2198 ; then, secondly, do t16-31 2199%if %3 <= 8 2200 mova m4, [%1+ 1*64] 2201 mova m7, [%1+ 7*64] 2202 2203 pmulhrsw m1, m4, [pw_16364x2] ;t31 2204 pmulhrsw m4, [pw_804x2] ;t16 2205 2206 VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30 2207 2208 pmulhrsw m3, m7, [pw_m5520x2] ;t19 2209 pmulhrsw m7, [pw_15426x2] ;t28 2210 2211 SCRATCH 4, 13, tmpq+ 1*%%str 2212 SCRATCH 5, 12, tmpq+15*%%str 2213 2214 
VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 2215%else 2216 mova m0, [%1+ 1*64] 2217 mova m1, [%1+15*64] 2218%if %3 <= 16 2219 pmulhrsw m5, m0, [pw_16364x2] 2220 pmulhrsw m0, [pw_804x2] 2221 pmulhrsw m4, m1, [pw_m11003x2] 2222 pmulhrsw m1, [pw_12140x2] 2223%else 2224 mova m4, [%1+17*64] 2225 mova m5, [%1+31*64] 2226 2227 VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31 2228 VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30 2229%endif 2230 SUMSUB_BA w, 4, 0, 2 2231 SUMSUB_BA w, 1, 5, 2 2232 2233 VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30 2234 2235 SCRATCH 4, 13, tmpq+ 1*%%str 2236 SCRATCH 5, 12, tmpq+15*%%str 2237 2238 mova m2, [%1+ 7*64] 2239 mova m3, [%1+ 9*64] 2240%if %3 <= 16 2241 pmulhrsw m7, m3, [pw_14811x2] 2242 pmulhrsw m3, [pw_7005x2] 2243 pmulhrsw m6, m2, [pw_m5520x2] 2244 pmulhrsw m2, [pw_15426x2] 2245%else 2246 mova m7, [%1+23*64] 2247 mova m6, [%1+25*64] 2248 2249 VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29 2250 VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28 2251%endif 2252 SUMSUB_BA w, 3, 6, 4 2253 SUMSUB_BA w, 7, 2, 4 2254 2255 VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 2256%endif 2257 2258 UNSCRATCH 5, 12, tmpq+15*%%str 2259 SUMSUB_BA w, 6, 0, 4 2260 mova [tmpq+25*%%str], m6 ; t19 2261 UNSCRATCH 4, 13, tmpq+ 1*%%str 2262 SUMSUB_BA w, 7, 1, 6 2263 SUMSUB_BA w, 3, 4, 6 2264 mova [tmpq+23*%%str], m3 ; t16 2265 SUMSUB_BA w, 2, 5, 6 2266 2267 VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29 2268 VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28 2269 2270 SCRATCH 0, 10, tmpq+ 1*%%str 2271 SCRATCH 1, 11, tmpq+ 7*%%str 2272 SCRATCH 2, 9, tmpq+ 9*%%str 2273 SCRATCH 4, 14, tmpq+15*%%str 2274 SCRATCH 5, 15, tmpq+17*%%str 2275 SCRATCH 7, 13, tmpq+31*%%str 2276 2277%if %3 <= 8 2278 mova m0, [%1+ 5*64] 2279 mova m3, [%1+ 3*64] 2280 2281 pmulhrsw m5, m0, [pw_15893x2] ;t27 2282 pmulhrsw m0, [pw_3981x2] ;t20 2283 2284 VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26 2285 2286 pmulhrsw m6, m3, [pw_m2404x2] ;t23 2287 pmulhrsw m3, [pw_16207x2] ;t24 2288 2289 SCRATCH 5, 8, tmpq+ 5*%%str 2290 SCRATCH 4, 12, tmpq+11*%%str 2291 2292 VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 2293%else 2294 mova m4, [%1+ 5*64] 2295 mova m5, [%1+11*64] 2296%if %3 <= 16 2297 pmulhrsw m1, m4, [pw_15893x2] 2298 pmulhrsw m4, [pw_3981x2] 2299 pmulhrsw m0, m5, [pw_m8423x2] 2300 pmulhrsw m5, [pw_14053x2] 2301%else 2302 mova m0, [%1+21*64] 2303 mova m1, [%1+27*64] 2304 2305 VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27 2306 VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26 2307%endif 2308 SUMSUB_BA w, 0, 4, 2 2309 SUMSUB_BA w, 5, 1, 2 2310 2311 VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26 2312 2313 SCRATCH 5, 8, tmpq+ 5*%%str 2314 SCRATCH 4, 12, tmpq+11*%%str 2315 2316 mova m7, [%1+ 3*64] 2317 mova m6, [%1+13*64] 2318%if %3 <= 16 2319 pmulhrsw m3, m6, [pw_13160x2] 2320 pmulhrsw m6, [pw_9760x2] 2321 pmulhrsw m2, m7, [pw_m2404x2] 2322 pmulhrsw m7, [pw_16207x2] 2323%else 2324 mova m2, [%1+29*64] 2325 mova m3, [%1+19*64] 2326 VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25 2327 VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24 2328%endif 2329 SUMSUB_BA w, 6, 2, 4 2330 SUMSUB_BA w, 3, 7, 4 2331 2332 VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, 
[pd_8192], 4, 5 ; t22, t25 2333%endif 2334 2335 ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, 2336 ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 2337 2338 UNSCRATCH 4, 12, tmpq+11*%%str 2339 SUMSUB_BA w, 0, 6, 5 2340 SUMSUB_BA w, 4, 2, 5 2341 UNSCRATCH 5, 8, tmpq+ 5*%%str 2342 SCRATCH 4, 8, tmpq+11*%%str 2343 SUMSUB_BA w, 1, 7, 4 2344 SUMSUB_BA w, 5, 3, 4 2345 SCRATCH 5, 12, tmpq+ 5*%%str 2346 2347 VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27 2348 VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26 2349 2350 ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, 2351 ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 2352 2353 UNSCRATCH 5, 9, tmpq+ 9*%%str 2354 mova m4, [tmpq+23*%%str] ; t16 2355%if ARCH_X86_64 2356 SUMSUB_BA w, 1, 5, 9 2357 SUMSUB_BA w, 0, 4, 9 2358%else 2359 SUMSUB_BADC w, 1, 5, 0, 4 2360%endif 2361 mova [tmpq+29*%%str], m1 ; t17 2362 mova [tmpq+21*%%str], m0 ; t16 2363 UNSCRATCH 0, 10, tmpq+ 1*%%str 2364 UNSCRATCH 1, 11, tmpq+ 7*%%str 2365%if ARCH_X86_64 2366 SUMSUB_BA w, 2, 0, 9 2367 SUMSUB_BA w, 3, 1, 9 2368%else 2369 SUMSUB_BADC w, 2, 0, 3, 1 2370%endif 2371 mova [tmpq+ 9*%%str], m2 ; t18 2372 mova [tmpq+13*%%str], m3 ; t19 2373 SCRATCH 0, 10, tmpq+23*%%str 2374 SCRATCH 1, 11, tmpq+27*%%str 2375 2376 UNSCRATCH 2, 14, tmpq+15*%%str 2377 UNSCRATCH 3, 15, tmpq+17*%%str 2378 SUMSUB_BA w, 6, 2, 0 2379 SUMSUB_BA w, 7, 3, 0 2380 SCRATCH 6, 14, tmpq+ 3*%%str 2381 SCRATCH 7, 15, tmpq+ 7*%%str 2382 2383 UNSCRATCH 0, 8, tmpq+11*%%str 2384 mova m1, [tmpq+25*%%str] ; t19 2385 UNSCRATCH 6, 12, tmpq+ 5*%%str 2386 UNSCRATCH 7, 13, tmpq+31*%%str 2387%if ARCH_X86_64 2388 SUMSUB_BA w, 0, 1, 9 2389 SUMSUB_BA w, 6, 7, 9 2390%else 2391 SUMSUB_BADC w, 0, 1, 6, 7 2392%endif 2393 2394 ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, 2395 ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 2396 2397%if 0; cpuflag(ssse3) 2398%if ARCH_X86_64 2399 SUMSUB_BA w, 4, 7, 8 2400 SUMSUB_BA w, 5, 1, 8 2401%else 2402 SUMSUB_BADC w, 4, 7, 5, 1 2403%endif 2404 2405 pmulhrsw m7, [pw_11585x2] 2406 pmulhrsw m4, [pw_11585x2] 2407 pmulhrsw m1, [pw_11585x2] 2408 pmulhrsw m5, [pw_11585x2] 2409 2410 mova [tmpq+ 5*%%str], m7 ; t23 2411 SCRATCH 1, 13, tmpq+25*%%str 2412 UNSCRATCH 7, 10, tmpq+23*%%str 2413 UNSCRATCH 1, 11, tmpq+27*%%str 2414 2415%if ARCH_X86_64 2416 SUMSUB_BA w, 7, 3, 10 2417 SUMSUB_BA w, 1, 2, 10 2418%else 2419 SUMSUB_BADC w, 7, 3, 1, 2 2420%endif 2421 2422 pmulhrsw m3, [pw_11585x2] 2423 pmulhrsw m7, [pw_11585x2] 2424 pmulhrsw m2, [pw_11585x2] 2425 pmulhrsw m1, [pw_11585x2] 2426%else 2427 SCRATCH 0, 8, tmpq+15*%%str 2428 SCRATCH 6, 9, tmpq+17*%%str 2429 VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6 2430 mova [tmpq+ 5*%%str], m7 ; t23 2431 UNSCRATCH 7, 10, tmpq+23*%%str 2432 VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6 2433 SCRATCH 1, 13, tmpq+25*%%str 2434 UNSCRATCH 1, 11, tmpq+27*%%str 2435 VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6 2436 VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6 2437 UNSCRATCH 0, 8, tmpq+15*%%str 2438 UNSCRATCH 6, 9, tmpq+17*%%str 2439%endif 2440 2441 ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23, 2442 ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31 2443 2444 ; then do final pass to sumsub+store the two halves 2445%if %2 == 1 2446 mova [tmpq+17*%%str], m2 ; t20 2447 mova [tmpq+ 1*%%str], m3 ; t21 2448%if ARCH_X86_64 
2449 mova [tmpq+25*%%str], m13 ; t22 2450 2451 mova m8, [tmpq+ 0*%%str] ; t0 2452 mova m9, [tmpq+ 4*%%str] ; t1 2453 mova m12, [tmpq+ 8*%%str] ; t2 2454 mova m11, [tmpq+12*%%str] ; t3 2455 mova m2, [tmpq+16*%%str] ; t4 2456 mova m3, [tmpq+20*%%str] ; t5 2457 mova m13, [tmpq+24*%%str] ; t6 2458 2459 SUMSUB_BA w, 6, 8, 10 2460 mova [tmpq+ 3*%%str], m8 ; t15 2461 SUMSUB_BA w, 0, 9, 8 2462 SUMSUB_BA w, 15, 12, 8 2463 SUMSUB_BA w, 14, 11, 8 2464 SUMSUB_BA w, 1, 2, 8 2465 SUMSUB_BA w, 7, 3, 8 2466 SUMSUB_BA w, 5, 13, 8 2467 mova m10, [tmpq+28*%%str] ; t7 2468 SUMSUB_BA w, 4, 10, 8 2469%if cpuflag(avx2) 2470 ; the "shitty" about this idct is that the final pass does the outermost 2471 ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need 2472 ; to be sequential, which means I need to load/store half of the sumsub 2473 ; intermediates back to/from memory to get a 16x16 transpose going... 2474 ; This would be easier if we had more (e.g. 32) YMM regs here. 2475 mova [tmpq+ 7*%%str], m9 2476 mova [tmpq+11*%%str], m12 2477 mova [tmpq+15*%%str], m11 2478 mova [tmpq+19*%%str], m2 2479 mova [tmpq+23*%%str], m3 2480 mova [tmpq+27*%%str], m13 2481 mova [tmpq+31*%%str], m10 2482 mova [tmpq+12*%%str], m5 2483 2484 mova m13, [tmpq+30*%%str] ; t8 2485 mova m12, [tmpq+26*%%str] ; t9 2486 mova m11, [tmpq+22*%%str] ; t10 2487 mova m10, [tmpq+18*%%str] ; t11 2488 mova m9, [tmpq+17*%%str] ; t20 2489 mova m8, [tmpq+ 1*%%str] ; t21 2490 mova m3, [tmpq+25*%%str] ; t22 2491 mova m2, [tmpq+ 5*%%str] ; t23 2492 2493 SUMSUB_BA w, 9, 10, 5 2494 SUMSUB_BA w, 8, 11, 5 2495 SUMSUB_BA w, 3, 12, 5 2496 SUMSUB_BA w, 2, 13, 5 2497 mova [tmpq+ 1*%%str], m10 2498 mova [tmpq+ 5*%%str], m11 2499 mova [tmpq+17*%%str], m12 2500 mova [tmpq+25*%%str], m13 2501 2502 mova m13, [tmpq+14*%%str] ; t12 2503 mova m12, [tmpq+10*%%str] ; t13 2504 mova m11, [tmpq+ 9*%%str] ; t18 2505 mova m10, [tmpq+13*%%str] ; t19 2506 2507 SUMSUB_BA w, 11, 12, 5 2508 SUMSUB_BA w, 10, 13, 5 2509 mova [tmpq+ 9*%%str], m13 2510 mova [tmpq+13*%%str], m12 2511 mova [tmpq+10*%%str], m10 2512 mova [tmpq+14*%%str], m11 2513 2514 mova m13, [tmpq+ 6*%%str] ; t14 2515 mova m12, [tmpq+ 2*%%str] ; t15 2516 mova m11, [tmpq+21*%%str] ; t16 2517 mova m10, [tmpq+29*%%str] ; t17 2518 SUMSUB_BA w, 11, 12, 5 2519 SUMSUB_BA w, 10, 13, 5 2520 mova [tmpq+21*%%str], m12 2521 mova [tmpq+29*%%str], m13 2522 mova m12, [tmpq+10*%%str] 2523 mova m13, [tmpq+14*%%str] 2524 2525 TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \ 2526 2, 3, 8, 9, 12, 13, 10, 11, \ 2527 [tmpq+12*%%str], [tmpq+ 8*%%str], 1 2528 mova [tmpq+ 0*%%str], m6 2529 mova [tmpq+ 2*%%str], m0 2530 mova [tmpq+ 4*%%str], m15 2531 mova [tmpq+ 6*%%str], m14 2532 mova [tmpq+10*%%str], m7 2533 mova [tmpq+12*%%str], m5 2534 mova [tmpq+14*%%str], m4 2535 mova [tmpq+16*%%str], m2 2536 mova [tmpq+18*%%str], m3 2537 mova [tmpq+20*%%str], m8 2538 mova [tmpq+22*%%str], m9 2539 mova [tmpq+24*%%str], m12 2540 mova [tmpq+26*%%str], m13 2541 mova [tmpq+28*%%str], m10 2542 mova [tmpq+30*%%str], m11 2543 2544 mova m0, [tmpq+21*%%str] 2545 mova m1, [tmpq+29*%%str] 2546 mova m2, [tmpq+13*%%str] 2547 mova m3, [tmpq+ 9*%%str] 2548 mova m4, [tmpq+ 1*%%str] 2549 mova m5, [tmpq+ 5*%%str] 2550 mova m7, [tmpq+25*%%str] 2551 mova m8, [tmpq+31*%%str] 2552 mova m9, [tmpq+27*%%str] 2553 mova m10, [tmpq+23*%%str] 2554 mova m11, [tmpq+19*%%str] 2555 mova m12, [tmpq+15*%%str] 2556 mova m13, [tmpq+11*%%str] 2557 mova m14, [tmpq+ 7*%%str] 2558 mova m15, [tmpq+ 3*%%str] 2559 TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \ 2560 8, 9, 10, 11, 12, 13, 14, 15, 
\ 2561 [tmpq+17*%%str], [tmpq+ 9*%%str], 1 2562 mova [tmpq+ 1*%%str], m0 2563 mova [tmpq+ 3*%%str], m1 2564 mova [tmpq+ 5*%%str], m2 2565 mova [tmpq+ 7*%%str], m3 2566 mova [tmpq+11*%%str], m5 2567 mova [tmpq+13*%%str], m6 2568 mova [tmpq+15*%%str], m7 2569 mova [tmpq+17*%%str], m8 2570 mova [tmpq+19*%%str], m9 2571 mova [tmpq+21*%%str], m10 2572 mova [tmpq+23*%%str], m11 2573 mova [tmpq+25*%%str], m12 2574 mova [tmpq+27*%%str], m13 2575 mova [tmpq+29*%%str], m14 2576 mova [tmpq+31*%%str], m15 2577%else ; !avx2 2578 TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8 2579 mova [tmpq+ 0*%%str], m6 2580 mova [tmpq+ 4*%%str], m0 2581 mova [tmpq+ 8*%%str], m15 2582 mova [tmpq+12*%%str], m14 2583 mova [tmpq+16*%%str], m1 2584 mova [tmpq+20*%%str], m7 2585 mova [tmpq+24*%%str], m5 2586 mova [tmpq+28*%%str], m4 2587 2588 mova m8, [tmpq+ 3*%%str] ; t15 2589 TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0 2590 mova [tmpq+ 3*%%str], m10 2591 mova [tmpq+ 7*%%str], m13 2592 mova [tmpq+11*%%str], m3 2593 mova [tmpq+15*%%str], m2 2594 mova [tmpq+19*%%str], m11 2595 mova [tmpq+23*%%str], m12 2596 mova [tmpq+27*%%str], m9 2597 mova [tmpq+31*%%str], m8 2598 2599 mova m15, [tmpq+30*%%str] ; t8 2600 mova m14, [tmpq+26*%%str] ; t9 2601 mova m13, [tmpq+22*%%str] ; t10 2602 mova m12, [tmpq+18*%%str] ; t11 2603 mova m11, [tmpq+14*%%str] ; t12 2604 mova m10, [tmpq+10*%%str] ; t13 2605 mova m9, [tmpq+ 6*%%str] ; t14 2606 mova m8, [tmpq+ 2*%%str] ; t15 2607 mova m7, [tmpq+21*%%str] ; t16 2608 mova m6, [tmpq+29*%%str] ; t17 2609 mova m5, [tmpq+ 9*%%str] ; t18 2610 mova m4, [tmpq+13*%%str] ; t19 2611 mova m3, [tmpq+17*%%str] ; t20 2612 mova m2, [tmpq+ 1*%%str] ; t21 2613 mova m1, [tmpq+25*%%str] ; t22 2614 2615 SUMSUB_BA w, 7, 8, 0 2616 mova [tmpq+ 2*%%str], m8 2617 mova m0, [tmpq+ 5*%%str] ; t23 2618 SUMSUB_BA w, 6, 9, 8 2619 SUMSUB_BA w, 5, 10, 8 2620 SUMSUB_BA w, 4, 11, 8 2621 SUMSUB_BA w, 3, 12, 8 2622 SUMSUB_BA w, 2, 13, 8 2623 SUMSUB_BA w, 1, 14, 8 2624 SUMSUB_BA w, 0, 15, 8 2625 2626 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 2627 mova [tmpq+ 1*%%str], m0 2628 mova [tmpq+ 5*%%str], m1 2629 mova [tmpq+ 9*%%str], m2 2630 mova [tmpq+13*%%str], m3 2631 mova [tmpq+17*%%str], m4 2632 mova [tmpq+21*%%str], m5 2633 mova [tmpq+25*%%str], m6 2634 mova [tmpq+29*%%str], m7 2635 2636 mova m8, [tmpq+ 2*%%str] 2637 TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 2638 mova [tmpq+ 2*%%str], m8 2639 mova [tmpq+ 6*%%str], m9 2640 mova [tmpq+10*%%str], m10 2641 mova [tmpq+14*%%str], m11 2642 mova [tmpq+18*%%str], m12 2643 mova [tmpq+22*%%str], m13 2644 mova [tmpq+26*%%str], m14 2645 mova [tmpq+30*%%str], m15 2646%endif ; avx2 2647%else 2648 mova m2, [tmpq+24*%%str] ; t6 2649 mova m3, [tmpq+28*%%str] ; t7 2650 SUMSUB_BADC w, 5, 2, 4, 3 2651 mova [tmpq+24*%%str], m5 2652 mova [tmpq+23*%%str], m2 2653 mova [tmpq+28*%%str], m4 2654 mova [tmpq+19*%%str], m3 2655 2656 mova m2, [tmpq+16*%%str] ; t4 2657 mova m3, [tmpq+20*%%str] ; t5 2658 SUMSUB_BA w, 1, 2, 5 2659 SUMSUB_BA w, 7, 3, 5 2660 mova [tmpq+15*%%str], m2 2661 mova [tmpq+11*%%str], m3 2662 2663 mova m2, [tmpq+ 0*%%str] ; t0 2664 mova m3, [tmpq+ 4*%%str] ; t1 2665 SUMSUB_BA w, 6, 2, 5 2666 SUMSUB_BA w, 0, 3, 5 2667 mova [tmpq+31*%%str], m2 2668 mova [tmpq+27*%%str], m3 2669 2670 mova m2, [tmpq+ 8*%%str] ; t2 2671 mova m3, [tmpq+12*%%str] ; t3 2672 mova m5, [tmpq+ 7*%%str] 2673 mova m4, [tmpq+ 3*%%str] 2674 SUMSUB_BADC w, 5, 2, 4, 3 2675 mova [tmpq+ 7*%%str], m2 2676 mova [tmpq+ 3*%%str], m3 2677 2678 mova m3, [tmpq+28*%%str] 2679 TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], 
[tmpq+16*%%str], 1 2680 mova [tmpq+ 0*%%str], m6 2681 mova [tmpq+ 4*%%str], m0 2682 mova [tmpq+ 8*%%str], m5 2683 mova [tmpq+12*%%str], m4 2684 mova [tmpq+20*%%str], m7 2685 mova [tmpq+24*%%str], m2 2686 mova [tmpq+28*%%str], m3 2687 2688 mova m6, [tmpq+19*%%str] 2689 mova m0, [tmpq+23*%%str] 2690 mova m5, [tmpq+11*%%str] 2691 mova m4, [tmpq+15*%%str] 2692 mova m1, [tmpq+ 3*%%str] 2693 mova m7, [tmpq+ 7*%%str] 2694 mova m3, [tmpq+31*%%str] 2695 TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1 2696 mova [tmpq+ 3*%%str], m6 2697 mova [tmpq+ 7*%%str], m0 2698 mova [tmpq+11*%%str], m5 2699 mova [tmpq+15*%%str], m4 2700 mova [tmpq+23*%%str], m7 2701 mova [tmpq+27*%%str], m2 2702 mova [tmpq+31*%%str], m3 2703 2704 mova m1, [tmpq+ 6*%%str] ; t14 2705 mova m0, [tmpq+ 2*%%str] ; t15 2706 mova m7, [tmpq+21*%%str] ; t16 2707 mova m6, [tmpq+29*%%str] ; t17 2708 SUMSUB_BA w, 7, 0, 2 2709 SUMSUB_BA w, 6, 1, 2 2710 mova [tmpq+29*%%str], m7 2711 mova [tmpq+ 2*%%str], m0 2712 mova [tmpq+21*%%str], m6 2713 mova [tmpq+ 6*%%str], m1 2714 2715 mova m1, [tmpq+14*%%str] ; t12 2716 mova m0, [tmpq+10*%%str] ; t13 2717 mova m5, [tmpq+ 9*%%str] ; t18 2718 mova m4, [tmpq+13*%%str] ; t19 2719 SUMSUB_BA w, 5, 0, 2 2720 SUMSUB_BA w, 4, 1, 2 2721 mova [tmpq+10*%%str], m0 2722 mova [tmpq+14*%%str], m1 2723 2724 mova m1, [tmpq+22*%%str] ; t10 2725 mova m0, [tmpq+18*%%str] ; t11 2726 mova m3, [tmpq+17*%%str] ; t20 2727 mova m2, [tmpq+ 1*%%str] ; t21 2728 SUMSUB_BA w, 3, 0, 6 2729 SUMSUB_BA w, 2, 1, 6 2730 mova [tmpq+18*%%str], m0 2731 mova [tmpq+22*%%str], m1 2732 2733 mova m7, [tmpq+30*%%str] ; t8 2734 mova m6, [tmpq+26*%%str] ; t9 2735 mova m1, [tmpq+25*%%str] ; t22 2736 mova m0, [tmpq+ 5*%%str] ; t23 2737 SUMSUB_BADC w, 1, 6, 0, 7 2738 mova [tmpq+26*%%str], m6 2739 mova [tmpq+30*%%str], m7 2740 2741 mova m7, [tmpq+29*%%str] 2742 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1 2743 mova [tmpq+ 1*%%str], m0 2744 mova [tmpq+ 5*%%str], m1 2745 mova [tmpq+ 9*%%str], m2 2746 mova [tmpq+13*%%str], m3 2747 mova [tmpq+21*%%str], m5 2748 mova [tmpq+25*%%str], m6 2749 mova [tmpq+29*%%str], m7 2750 2751 mova m0, [tmpq+ 2*%%str] 2752 mova m1, [tmpq+ 6*%%str] 2753 mova m2, [tmpq+10*%%str] 2754 mova m3, [tmpq+14*%%str] 2755 mova m4, [tmpq+18*%%str] 2756 mova m5, [tmpq+22*%%str] 2757 mova m7, [tmpq+30*%%str] 2758 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1 2759 mova [tmpq+ 2*%%str], m0 2760 mova [tmpq+ 6*%%str], m1 2761 mova [tmpq+10*%%str], m2 2762 mova [tmpq+14*%%str], m3 2763 mova [tmpq+22*%%str], m5 2764 mova [tmpq+26*%%str], m6 2765 mova [tmpq+30*%%str], m7 2766%endif 2767%else 2768 ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str] 2769 ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str] 2770 ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str] 2771 ; t20-22 is in m4-6 2772 ; t24-31 is in m8-15 2773 2774%if cpuflag(ssse3) 2775%define ROUND_REG [pw_512] 2776%else 2777%define ROUND_REG [pw_32] 2778%endif 2779 2780%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs 2781 SUMSUB_BA w, %4, %1, %5 2782 SUMSUB_BA w, %3, %2, %5 2783 VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6 2784%if %8 == 1 2785 add dstq, stride2q 2786%endif 2787 VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq 2788%if %8 == 1 2789 sub dst_endq, stride2q 2790%endif 2791%endmacro 2792 2793%if ARCH_X86_64 2794 pxor m10, m10 2795 2796 ; store t0-1 and t30-31 2797 mova m8, [tmpq+ 0*%%str] 2798 mova m9, [tmpq+ 4*%%str] 2799 %%STORE_2X2 8, 9, 0, 6, 12, 11, 10 2800 2801 ; 
store t2-3 and t28-29 2802 mova m8, [tmpq+ 8*%%str] 2803 mova m9, [tmpq+12*%%str] 2804 %%STORE_2X2 8, 9, 14, 15, 12, 11, 10 2805 2806 ; store t4-5 and t26-27 2807 mova m8, [tmpq+16*%%str] 2808 mova m9, [tmpq+20*%%str] 2809 %%STORE_2X2 8, 9, 7, 1, 12, 11, 10 2810 2811 ; store t6-7 and t24-25 2812 mova m8, [tmpq+24*%%str] 2813 mova m9, [tmpq+28*%%str] 2814 %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 2815 2816 ; store t8-9 and t22-23 2817 mova m8, [tmpq+30*%%str] 2818 mova m9, [tmpq+26*%%str] 2819 mova m0, [tmpq+ 5*%%str] 2820 %%STORE_2X2 8, 9, 13, 0, 12, 11, 10 2821 2822 ; store t10-11 and t20-21 2823 mova m8, [tmpq+22*%%str] 2824 mova m9, [tmpq+18*%%str] 2825 %%STORE_2X2 8, 9, 2, 3, 12, 11, 10 2826 2827 ; store t12-13 and t18-19 2828 mova m8, [tmpq+14*%%str] 2829 mova m9, [tmpq+10*%%str] 2830 mova m5, [tmpq+13*%%str] 2831 mova m4, [tmpq+ 9*%%str] 2832 %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 2833 2834 ; store t14-17 2835 mova m8, [tmpq+ 6*%%str] 2836 mova m9, [tmpq+ 2*%%str] 2837 mova m5, [tmpq+29*%%str] 2838 mova m4, [tmpq+21*%%str] 2839 %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0 2840 2841 SWAP 1, 10 ; zero 2842%else 2843 mova [tmpq+ 1*%%str], m1 2844 mova [tmpq+11*%%str], m2 2845 mova [tmpq+15*%%str], m3 2846 mova [tmpq+17*%%str], m4 2847 mova [tmpq+19*%%str], m5 2848 pxor m1, m1 2849 2850 ; store t0-1 and t30-31 2851 mova m2, [tmpq+ 0*%%str] 2852 mova m3, [tmpq+ 4*%%str] 2853 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2854 2855 ; store t2-3 and t28-29 2856 mova m2, [tmpq+ 8*%%str] 2857 mova m3, [tmpq+12*%%str] 2858 mova m0, [tmpq+ 3*%%str] 2859 mova m6, [tmpq+ 7*%%str] 2860 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2861 2862 ; store t4-5 and t26-27 2863 mova m2, [tmpq+16*%%str] 2864 mova m3, [tmpq+20*%%str] 2865 mova m0, [tmpq+ 1*%%str] 2866 %%STORE_2X2 2, 3, 7, 0, 4, 5, 1 2867 2868 ; store t6-7 and t24-25 2869 mova m2, [tmpq+24*%%str] 2870 mova m3, [tmpq+28*%%str] 2871 mova m0, [tmpq+17*%%str] 2872 mova m6, [tmpq+19*%%str] 2873 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2874 2875 ; store t8-9 and t22-23 2876 mova m2, [tmpq+30*%%str] 2877 mova m3, [tmpq+26*%%str] 2878 mova m0, [tmpq+25*%%str] 2879 mova m6, [tmpq+ 5*%%str] 2880 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2881 2882 ; store t10-11 and t20-21 2883 mova m2, [tmpq+22*%%str] 2884 mova m3, [tmpq+18*%%str] 2885 mova m0, [tmpq+11*%%str] 2886 mova m6, [tmpq+15*%%str] 2887 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2888 2889 ; store t12-13 and t18-19 2890 mova m2, [tmpq+14*%%str] 2891 mova m3, [tmpq+10*%%str] 2892 mova m6, [tmpq+13*%%str] 2893 mova m0, [tmpq+ 9*%%str] 2894 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2895 2896 ; store t14-17 2897 mova m2, [tmpq+ 6*%%str] 2898 mova m3, [tmpq+ 2*%%str] 2899 mova m6, [tmpq+29*%%str] 2900 mova m0, [tmpq+21*%%str] 2901 %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0 2902%endif 2903%undef ROUND_REG 2904%endif 2905%endmacro 2906 2907%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 2908INIT_XMM %1 2909cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob 2910 movifnidn eobd, dword eobm 2911%if cpuflag(ssse3) 2912 cmp eobd, 135 2913 jg .idctfull 2914 cmp eobd, 34 2915 jg .idct16x16 2916 cmp eobd, 1 2917 jg .idct8x8 2918%else 2919 cmp eobd, 1 2920 jg .idctfull 2921%endif 2922 2923 ; dc-only case 2924 movifnidn blockq, blockmp 2925 movifnidn dstq, dstmp 2926 movifnidn strideq, stridemp 2927%if cpuflag(ssse3) 2928 movd m0, [blockq] 2929 mova m1, [pw_11585x2] 2930 pmulhrsw m0, m1 2931 pmulhrsw m0, m1 2932%else 2933 DEFINE_ARGS dst, stride, block, coef 2934 movsx coefd, word [blockq] 2935 imul coefd, 11585 2936 add coefd, 8192 2937 sar coefd, 14 2938 imul coefd, 11585 
    add              coefd, (32 << 14) + 8192
    sar              coefd, 14 + 6
    movd                m0, coefd
%endif
    SPLATW              m0, m0, q0000
%if cpuflag(ssse3)
    pmulhrsw            m0, [pw_512]
%endif
    pxor                m5, m5
    movd          [blockq], m5
%rep 31
    VP9_STORE_2XFULL     0, 1, 2, 3, 4, 5, mmsize
    add               dstq, strideq
%endrep
    VP9_STORE_2XFULL     0, 1, 2, 3, 4, 5, mmsize
    RET

%if ARCH_X86_64
    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
%else
%define dst_bakq r0mp
%endif
%if cpuflag(ssse3)
.idct8x8:
%if ARCH_X86_32
    DEFINE_ARGS block, u1, u2, u3, u4, tmp
    mov             blockq, r2mp
%endif
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 8

%if ARCH_X86_32
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif
    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    sub          stride30q, stride2q        ; stride*30
.loop2_8x8:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 8
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_8x8

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 8, m1
    RET

.idct16x16:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov             blockq, r2mp
%endif
    mov               cntd, 2
    mov               tmpq, rsp
.loop1_16x16:
    VP9_IDCT32_1D   blockq, 1, 16
    add             blockq, 16
    add               tmpq, 512
    dec               cntd
    jg .loop1_16x16

%if ARCH_X86_64
    sub             blockq, 32
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 16, m1
    RET
%endif

.idctfull:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov             blockq, r2mp
%endif
    mov               cntd, 4
    mov               tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 16
    add               tmpq, 512
    dec               cntd
    jg .loop1_full

%if ARCH_X86_64
    sub             blockq, 64
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endmacro
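
; For reference, the dc-only branch inside the macro above reduces to the
; following C-style sketch. This is illustrative only, not the decoder's
; actual C fallback: the function name is made up, av_clip_uint8() is
; FFmpeg's saturating 0..255 clip (libavutil/common.h; stdint.h/stddef.h
; assumed for the types), and the scalar path above folds the last two
; roundings into a single add/sar, which this sketch keeps separate:
;
;   static void idct32_dc_only_add_c(uint8_t *dst, ptrdiff_t stride,
;                                    const int16_t *block)
;   {
;       int x, y, dc = block[0];
;       dc = (dc * 11585 + (1 << 13)) >> 14; // dc * cos(pi/4) in Q14, rounded
;       dc = (dc * 11585 + (1 << 13)) >> 14; // applied once per 1D pass
;       dc = (dc + 32) >> 6;                 // final 32x32 output rounding
;       for (y = 0; y < 32; y++, dst += stride)
;           for (x = 0; x < 32; x++)
;               dst[x] = av_clip_uint8(dst[x] + dc);
;   }
;
; The ssse3/avx/avx2 paths compute the same value with two pmulhrsw by
; pw_11585x2 (each a rounded Q14 multiply) and one pmulhrsw by pw_512,
; since (x*512 + (1<<14)) >> 15 <=> (x+32) >> 6.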
VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx

; this is almost identical to VP9_STORE_2X, but it does two rows
; for slightly improved interleaving, and it omits vpermq since the
; input is DC so all values are identical
%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    mova               m%2, [dstq]
    mova               m%4, [dstq+strideq]
    punpckhbw          m%3, m%2, m%6
    punpcklbw          m%2, m%6
    punpckhbw          m%5, m%4, m%6
    punpcklbw          m%4, m%6
    paddw              m%3, m%1
    paddw              m%2, m%1
    paddw              m%5, m%1
    paddw              m%4, m%1
    packuswb           m%2, m%3
    packuswb           m%4, m%5
    mova  [dstq+strideq*0], m%2
    mova  [dstq+strideq*1], m%4
%endmacro

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
    cmp               eobd, 135
    jg .idctfull
    cmp               eobd, 1
    jg .idct16x16

    ; dc-only case
    mova                m1, [pw_11585x2]
    vpbroadcastw        m0, [blockq]
    pmulhrsw            m0, m1
    pmulhrsw            m0, m1
    pxor                m5, m5
    pmulhrsw            m0, [pw_512]
    movd          [blockq], xm5

    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 16
.loop_dc:
    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
    lea               dstq, [dstq+2*strideq]
    dec               cntd
    jg .loop_dc
    RET

    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
.idct16x16:
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 16

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    sub          stride30q, stride2q        ; stride*30
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 16, m1
    RET

.idctfull:
    mov               cntd, 2
    mov               tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 32
    add               tmpq, 1024
    dec               cntd
    jg .loop1_full

    sub             blockq, 64

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endif
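
; A note on the eob thresholds used by the 32x32 entry points above: eob is
; the number of coefficients up to and including the last nonzero one in scan
; order, so a small eob means only a top-left corner of the 32x32 coefficient
; block can be populated. The dispatch, as a rough C-style sketch (the helper
; names are placeholders, not real symbols; sse2 only special-cases eob == 1,
; and the avx2 version has no separate 8x8 case):
;
;   if (eob <= 1)          // DC coefficient only
;       idct32_dc_only_add(dst, stride, block);
;   else if (eob <= 34)    // nonzero coeffs fit in the top-left 8x8
;       idct32_add_from_8x8(dst, stride, block);
;   else if (eob <= 135)   // nonzero coeffs fit in the top-left 16x16
;       idct32_add_from_16x16(dst, stride, block);
;   else
;       idct32_add_full(dst, stride, block);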