;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; rounding biases (1 << (shift - 1)) for the shifts used below
pd_64: times 4 dd 64
pd_2048: times 4 dd 2048
pd_512: times 4 dd 512

; 4x4 transform coeffs
cextern pw_64
pw_64_m64: times 4 dw 64, -64
pw_83_36: times 4 dw 83, 36
pw_36_m83: times 4 dw 36, -83

; 8x8 transform coeffs
pw_89_75: times 4 dw 89, 75
pw_50_18: times 4 dw 50, 18

pw_75_m18: times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50

pw_50_m89: times 4 dw 50, -89
pw_18_75: times 4 dw 18, 75

pw_18_m50: times 4 dw 18, -50
pw_75_m89: times 4 dw 75, -89

; 16x16 transform coeffs
; 8 groups of 4 interleaved coefficient pairs, one group per odd output row
trans_coeffs16: times 4 dw 90, 87
times 4 dw 80, 70
times 4 dw 57, 43
times 4 dw 25, 9

times 4 dw 87, 57
times 4 dw 9, -43
times 4 dw -80, -90
times 4 dw -70, -25

times 4 dw 80, 9
times 4 dw -70, -87
times 4 dw -25, 57
times 4 dw 90, 43

times 4 dw 70, -43
times 4 dw -87, 9
times 4 dw 90, 25
times 4 dw -80, -57

times 4 dw 57, -80
times 4 dw -25, 90
times 4 dw -9, -87
times 4 dw 43, 70

times 4 dw 43, -90
times 4 dw 57, 25
times 4 dw -87, 70
times 4 dw 9, -80

times 4 dw 25, -70
times 4 dw 90, -80
times 4 dw 43, 9
times 4 dw -57, 87

times 4 dw 9, -25
times 4 dw 43, -57
times 4 dw 70, -80
times 4 dw 87, -90

; 32x32 transform coeffs
; 16 groups of 8 interleaved coefficient pairs, one group per odd output row
trans_coeff32: times 8 dw 90
times 4 dw 88, 85
times 4 dw 82, 78
times 4 dw 73, 67
times 4 dw 61, 54
times 4 dw 46, 38
times 4 dw 31, 22
times 4 dw 13, 4

times 4 dw 90, 82
times 4 dw 67, 46
times 4 dw 22, -4
times 4 dw -31, -54
times 4 dw -73, -85
times 4 dw -90, -88
times 4 dw -78, -61
times 4 dw -38, -13

times 4 dw 88, 67
times 4 dw 31, -13
times 4 dw -54, -82
times 4 dw -90, -78
times 4 dw -46, -4
times 4 dw 38, 73
times 4 dw 90, 85
times 4 dw 61, 22

times 4 dw 85, 46
times 4 dw -13, -67
times 4 dw -90, -73
times 4 dw -22, 38
times 4 dw 82, 88
times 4 dw 54, -4
times 4 dw -61, -90
times 4 dw -78, -31

times 4 dw 82, 22
times 4 dw -54, -90
times 4 dw -61, 13
times 4 dw 78, 85
times 4 dw 31, -46
times 4 dw -90, -67
times 4 dw 4, 73
times 4 dw 88, 38

times 4 dw 78, -4
times 4 dw -82, -73
times 4 dw 13, 85
times 4 dw 67, -22
times 4 dw -88, -61
times 4 dw 31, 90
times 4 dw 54, -38
times 4 dw -90, -46

times 4 dw 73, -31
times 4 dw -90, -22
times 4 dw 78, 67
times 4 dw -38, -90
times 4 dw -13, 82
times 4 dw 61, -46
times 4 dw -88, -4
times 4 dw 85, 54

times 4 dw 67, -54
times 4 dw -78, 38
times 4 dw 85, -22
times 4 dw -90, 4
times 4 dw 90, 13
times 4 dw -88, -31
times 4 dw 82, 46
times 4 dw -73, -61

times 4 dw 61, -73
times 4 dw -46, 82
times 4 dw 31, -88
times 4 dw -13, 90
times 4 dw -4, -90
times 4 dw 22, 85
times 4 dw -38, -78
times 4 dw 54, 67

times 4 dw 54, -85
times 4 dw -4, 88
times 4 dw -46, -61
times 4 dw 82, 13
times 4 dw -90, 38
times 4 dw 67, -78
times 4 dw -22, 90
times 4 dw -31, -73

times 4 dw 46, -90
times 4 dw 38, 54
times 4 dw -90, 31
times 4 dw 61, -88
times 4 dw 22, 67
times 4 dw -85, 13
times 4 dw 73, -82
times 4 dw 4, 78

times 4 dw 38, -88
times 4 dw 73, -4
times 4 dw -67, 90
times 4 dw -46, -31
times 4 dw 85, -78
times 4 dw 13, 61
times 4 dw -90, 54
times 4 dw 22, -82

times 4 dw 31, -78
times 4 dw 90, -61
times 4 dw 4, 54
times 4 dw -88, 82
times 4 dw -38, -22
times 4 dw 73, -90
times 4 dw 67, -13
times 4 dw -46, 85

times 4 dw 22, -61
times 4 dw 85, -90
times 4 dw 73, -38
times 4 dw -4, 46
times 4 dw -78, 90
times 4 dw -82, 54
times 4 dw -13, -31
times 4 dw 67, -88

times 4 dw 13, -38
times 4 dw 61, -78
times 4 dw 88, -90
times 4 dw 85, -73
times 4 dw 54, -31
times 4 dw 4, 22
times 4 dw -46, 67
times 4 dw -82, 90

times 4 dw 4, -13
times 4 dw 22, -31
times 4 dw 38, -46
times 4 dw 54, -61
times 4 dw 67, -73
times 4 dw 78, -82
times 4 dw 85, -88
times 4 dw 90, -90

SECTION .text

; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; Splat the scaled DC coefficient over the whole block, 8 mmsize stores
; per loop iteration.
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx  tmpd, word [coeffq]
    add    tmpd, (1 << (14 - %3)) + 1
    sar    tmpd, (15 - %3)
    movd   xm0, tmpd
    SPLATW m0, xm0
    DEFINE_ARGS coeff, cnt
    mov    cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    add  coeffq, mmsize*8
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec  cntd
    jg .loop
    RET
%endmacro

; Same as IDCT_DC but the whole block fits in 4 (mmx) or 8 (sse2) stores.
; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx  tmpd, word [coeffq]
    add    tmpd, (1 << (14 - %2)) + 1
    sar    tmpd, (15 - %2)
    movd   m0, tmpd
    SPLATW m0, xm0
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; IDCT 4x4, expects input in m0, m1
; %1 - shift
; %2 - 1/0 - SCALE and Transpose or not
; %3 - 1/0 add constant or not
%macro TR_4x4 3
    ; interleaves src0 with src2 to m0
    ; and src1 with src3 to m1
    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
    ; src1: 10 11 12 13        -->
    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
    ; src3: 30 31 32 33

    SBUTTERFLY wd, 0, 1, 2

    pmaddwd m2, m0, [pw_64]    ; e0
    pmaddwd m3, m1, [pw_83_36] ; o0
    pmaddwd m0, [pw_64_m64]    ; e1
    pmaddwd m1, [pw_36_m83]    ; o1

%if %3 == 1
    %assign %%add 1 << (%1 - 1)
    mova  m4, [pd_ %+ %%add]   ; rounding bias added to the even part only;
    paddd m2, m4               ; it propagates to both e+o and e-o results
    paddd m0, m4
%endif

    SUMSUB_BADC d, 3, 2, 1, 0, 4

%if %2 == 1
    psrad m3, %1 ; e0 + o0
    psrad m1, %1 ; e1 + o1
    psrad m2, %1 ; e0 - o0
    psrad m0, %1 ; e1 - o1
    ;clip16
    packssdw m3, m1
    packssdw m0, m2
    ; Transpose
    SBUTTERFLY wd, 3, 0, 1
    SBUTTERFLY wd, 3, 0, 1
    SWAP 3, 1, 0
%else
    SWAP 3, 2, 0
%endif
%endmacro

; defines shift = 20 - bitdepth and the name of the matching bias constant
%macro DEFINE_BIAS 1
    %assign shift (20 - %1)
    %assign c_add (1 << (shift - 1))
    %define arr_add pd_ %+ c_add
%endmacro

; %1 - bit_depth
; %2 - register the add constant is loaded to
; shift = 20 - bit_depth
%macro LOAD_BIAS 2
    DEFINE_BIAS %1
    mova %2, [arr_add]
%endmacro

; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
%macro LOAD_BLOCK 7
    movq   %1, [r0 + %3 + %7]
    movhps %1, [r0 + %5 + %7]
    movq   %2, [r0 + %4 + %7]
    movhps %2, [r0 + %6 + %7]
%endmacro

; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
    mova m0, [coeffsq]
    mova m1, [coeffsq + 16]

    TR_4x4 7, 1, 1       ; first pass: shift 7
    TR_4x4 20 - %1, 1, 1 ; second pass: shift 20 - bitdepth

    mova [coeffsq],      m0
    mova [coeffsq + 16], m1
    RET
%endmacro

; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
; 4 at one time (4 columns)                        1 e8[1] + o8[1]
; from %5: e8/16 + o8/16, with %1 offset                  ...
; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
; %4 - shift                                       7 e8[0] - o8[0] --> + %2
%macro STORE_8 7
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq   [coeffsq + %1], %5
    movhps [coeffsq + %2], %5
%endmacro

; compute o8 from m4/m5 and combine it with e8 (in %7)
; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block_size
; %9 - register to store e8 + o8
; %10 - register to store e8 - o8
%macro E8_O8 10
    pmaddwd m6, m4, %3
    pmaddwd m7, m5, %4

    paddd m6, m7
    paddd m7, m6, %7 ; o8 + e8
    psubd %7, m6     ; e8 - o8
%if %8 == 8
    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
    ; part of a larger transform: keep results in registers for E16_O16
    SWAP m7, %9
    SWAP %7, %10
%endif
%endmacro

; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 add a constant in TR_4x4 or not
; the constant is added for the 8x8 transform but not for 16x16 and 32x32
%macro TR_8x4 7
    ; load 4 columns of even rows
    LOAD_BLOCK m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1

    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only

    ; load 4 columns of odd rows
    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1

    ; 00 01 02 03
    ; 10 11 12 13     m4: 10 30 11 31 12 32 13 33
    ;     ...     -->
    ;                 m5: 50 70 51 71 52 72 53 73
    ; 70 71 72 73
    SBUTTERFLY wd, 4, 5, 6

    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8,  m15
    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9,  m14
    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro

; store two packed registers as a 4x4 block at 4 vertical offsets
%macro STORE_PACKED 7
    movq   [r0 + %3 + %7], %1
    movhps [r0 + %4 + %7], %1
    movq   [r0 + %5 + %7], %2
    movhps [r0 + %6 + %7], %2
%endmacro

; transpose 4x4 block packed
; in %1 and %2 registers
; %3 - temporary register
%macro TRANSPOSE_4x4 3
    SBUTTERFLY wd, %1, %2, %3
    SBUTTERFLY dq, %1, %2, %3
%endmacro

; transpose two off-diagonal 4x4 blocks and exchange them
; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
%macro SWAP_BLOCKS 5
    ; M_j
    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
    TRANSPOSE_4x4 4, 5, 6

    ; M_i
    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transpose and store M_i
    SWAP m6, m4
    SWAP m7, m5
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro

; transpose a diagonal 4x4 block in place
; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
%macro TRANSPOSE_BLOCK 3
    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro

%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
    ; M1 M2 ^T = M1^t M3^t
    ; M3 M4      M2^t M4^t

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 16

    ; M2 and M3
    SWAP_BLOCKS 0, 64, 16, 0, 8

    ; M4
    TRANSPOSE_BLOCK 8, 64, 16

    ret
%endmacro

; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
    TR_8x4 0, 7, 32, 1, 16, 8, 1
    TR_8x4 8, 7, 32, 1, 16, 8, 1

    call hevc_idct_transpose_8x8_ %+ cpuname

    DEFINE_BIAS %1
    TR_8x4 0, shift, 32, 1, 16, 8, 1
    TR_8x4 8, shift, 32, 1, 16, 8, 1

    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro

; store intermediate e32 coeffs on stack
; as 16x4 matrix
; from %5: e8 + o8, with %6 offset
; and  %3: e8 - o8, with %7 offset
; %4 - shift, unused here (same parameter order as STORE_8)
%macro STORE_16 7
    mova [rsp + %6], %5
    mova [rsp + %7], %3
%endmacro

; multiply-accumulate one pair of interleaved coeff rows
; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1/0 SWAP or add
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
%macro ADD_ROWS 8
    pmaddwd %6, %3, %1
    pmaddwd %7, %4, %2
    paddd   %6, %7
%if %5 == 1
    SWAP %6, %8
%else
    paddd %8, %6
%endif
%endmacro

; compute o16 and combine it with e16 (in %7)
; %1 - transform coeffs
; %2, %3 - offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - add
; %6 - block_size
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o
%macro E16_O16 9
    ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m5, m6, m7
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7

%if %6 == 8
    paddd %7, %5 ; add the rounding bias only when the result is stored
%endif

    paddd m4, m7, %7 ; o16 + e16
    psubd %7, m7     ; e16 - o16
    STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro

; 16x4 transform pass; e16 coeffs come from TR_8x4 (in m8-m15),
; o16 is accumulated from the odd rows with trans_coeffs16
; %1 - horizontal offset, %2 - shift, %3 - add constant (memory operand)
; %4 - even-row offset, %5 - step, %6 - row offset
; %7 - E16_O16 block size (8: store, 16: keep on stack)
; %8 - TR_8x4 block size, %9 - row stride multiplier, %10 - pass flag
%macro TR_16x4 10
    ; produce 8x4 matrix of e16 coeffs
    ; for 4 first rows and store it on stack (128 bytes)
    TR_8x4 %1, 7, %4, %5, %6, %8, 0

    ; load 8 odd rows (1, 3, ..., 15)
    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1

    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4

    E16_O16 trans_coeffs16, 0 + %1, 15 * %6 + %1, %2, %3, %7, m8, 0, 15 * 16
    mova m8, %3 ; m8 (first e16) is consumed, reuse it to hold the bias
    E16_O16 trans_coeffs16 + 64,     %6 + %1,     14 * %6 + %1, %2, m8, %7, m9,  16,     14 * 16
    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1, 9 * %6 + %1,  %2, m8, %7, m14, 6 * 16, 9 * 16
    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1, 8 * %6 + %1,  %2, m8, %7, m15, 7 * 16, 8 * 16
%endmacro

%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
; M1  M2  M3  M4  ^T      m1 m5 m9  m13   M_i^T = m_i
; M5  M6  M7  M8    -->   m2 m6 m10 m14
; M9  M10 M11 M12         m3 m7 m11 m15
; M13 M14 M15 M16         m4 m8 m12 m16

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 32

    ; M5, M2
    SWAP_BLOCKS 0, 128, 32, 0, 8
    ; M9, M3
    SWAP_BLOCKS 0, 256, 32, 0, 16
    ; M13, M4
    SWAP_BLOCKS 0, 384, 32, 0, 24

    ; M6
    TRANSPOSE_BLOCK 8, 128, 32

    ; M10, M7
    SWAP_BLOCKS 8, 256, 32, 128, 16
    ; M14, M8
    SWAP_BLOCKS 8, 384, 32, 128, 24

    ; M11
    TRANSPOSE_BLOCK 16, 256, 32

    ; M15, M12
    SWAP_BLOCKS 16, 384, 32, 256, 24

    ; M16
    TRANSPOSE_BLOCK 24, 384, 32

    ret
%endmacro

; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
    mov r1d, 3
.loop16:
    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
    dec r1d
    jge .loop16

    call hevc_idct_transpose_16x16_ %+ cpuname

    DEFINE_BIAS %1
    mov r1d, 3
.loop16_2:
    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
    dec r1d
    jge .loop16_2

    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro

; scale, pack (clip16) and store the residuals     0  e32[0] + o32[0] --> %1
; 4 at one time (4 columns)                        1  e32[1] + o32[1]
; %1 - address to store e32 + o32
; %2 - address to store e32 - o32
; %5 - reg with e32 + o32                                  ...
; %3 - reg with e32 - o32                          30 e32[1] - o32[1]
; %4 - shift                                       31 e32[0] - o32[0] --> %2
%macro STORE_32 5
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq   [%1], %5
    movhps [%2], %5
%endmacro

; compute o32 and combine it with e32 (loaded from the stack)
; %1 - transform coeffs
; %2, %3 - addresses for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - stack offset of e32
%macro E32_O32 5
    ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m8, m9, m10
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10

    paddd m11, m14, [rsp + %5] ; e32 + rounding bias (bias kept in m14)
    paddd m12, m10, m11        ; o32 + e32
    psubd m11, m10             ; e32 - o32
    STORE_32 %2, %3, m11, %4, m12
%endmacro

; 32x4 transform pass over one group of 4 columns
; %1 - horizontal offset
; %2 - bitdepth
; %3 - 1 = first pass (shift 7, bias 64), 0 = second pass (shift 20 - bitdepth)
%macro TR_32x4 3
    ; e32 coeffs are computed and stored on the stack
    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0

    ; load the 16 odd rows (1, 3, ..., 31)
    LOAD_BLOCK m0, m1, 64,      3 * 64,  5 * 64,  7 * 64,  %1
    LOAD_BLOCK m2, m3, 9 * 64,  11 * 64, 13 * 64, 15 * 64, %1
    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1

    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8

%if %3 == 1
    %assign shift 7
    mova m14, [pd_64]
%else
    LOAD_BIAS %2, m14
%endif

    lea r2, [trans_coeff32 + 15 * 128]
    lea r3, [coeffsq + %1]
    lea r4, [r3 + 16 * 64]
    mov r5d, 15 * 16
%%loop:
    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
    sub r2, 128
    add r4, 64
    sub r5d, 16
    jge %%loop
%endmacro

%macro TRANSPOSE_32x32 0
cglobal hevc_idct_transpose_32x32, 0, 0, 0
    ; M0  M1 ... M7
    ; M8       M15
    ;
    ;     ...
    ;
    ; M56      M63

    TRANSPOSE_BLOCK 0, 0, 64 ; M0
    mov r1d, 7
    mov r2d, 7 * 256
.loop_transpose:
    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
    sub r2d, 256
    dec r1d
    jg .loop_transpose

    TRANSPOSE_BLOCK 8, 256, 64 ; M9
    mov r1d, 6
    mov r2d, 512
    mov r3d, 16
.loop_transpose2:
    SWAP_BLOCKS 8, r2, 64, 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose2

    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
    mov r1d, 5
    mov r2d, 768
    mov r3d, 24
.loop_transpose3:
    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose3

    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
    mov r1d, 4
    mov r2d, 1024
    mov r3d, 32
.loop_transpose4:
    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose4

    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
    mov r1d, 3
    mov r2d, 1280
    mov r3d, 40
.loop_transpose5:
    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose5

    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8

    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8

    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63

    ret
%endmacro

; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_32x32 1
cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
    mov r1d, 7
.loop32:
    TR_32x4 8 * r1, %1, 1
    dec r1d
    jge .loop32

    call hevc_idct_transpose_32x32_ %+ cpuname

    mov r1d, 7
.loop32_2:
    TR_32x4 8 * r1, %1, 0
    dec r1d
    jge .loop32_2

    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
%endmacro

; instantiate the DC-only IDCTs for one bitdepth (%1)
%macro INIT_IDCT_DC 1
INIT_MMX mmxext
IDCT_DC_NL 4, %1
IDCT_DC    8, 2, %1

INIT_XMM sse2
IDCT_DC_NL 8, %1
IDCT_DC    16, 4, %1
IDCT_DC    32, 16, %1

%if HAVE_AVX2_EXTERNAL
    INIT_YMM avx2
    IDCT_DC 16, 2, %1
    IDCT_DC 32, 8, %1
%endif ;HAVE_AVX2_EXTERNAL
%endmacro

; instantiate the full IDCTs for bitdepth %1 with instruction set %2
%macro INIT_IDCT 2
INIT_XMM %2
%if %1 == 8
    TRANSPOSE_8x8
    %if ARCH_X86_64
        TRANSPOSE_16x16
        TRANSPOSE_32x32
    %endif
%endif
%if ARCH_X86_64
    IDCT_32x32 %1
    IDCT_16x16 %1
%endif
IDCT_8x8 %1
IDCT_4x4 %1
%endmacro

INIT_IDCT_DC 8
INIT_IDCT_DC 10
INIT_IDCT_DC 12
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx
;INIT_IDCT 12, sse2
;INIT_IDCT 12, avx