;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; scan8: for each of the 16 luma (and later chroma) 4x4 blocks, the byte
; offset of that block's entry in the nnzc[6*8] non-zero-count cache
; (indexed below as [nnzc + scan8[i]]).
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

; With PIC, scan8_mem must be addressed through a register (picregq),
; which costs one extra GPR in the cglobal declarations below.
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; Add the inverse-transformed 4x4 residual in *block to dst and clear *block.
; Uses m0-m7; IDCT4_1D, STORE_DIFFx2, TRANSPOSE4x4W, SBUTTERFLY, MOVHL
; come from x86util.asm.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
%if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    ; xmm path: rows are 4 words wide, so a cheaper half-register
    ; transpose (interleave + split) replaces the full 4x4 transpose
    punpcklwd    m0, m1
    punpcklwd    m2, m3
    SBUTTERFLY   dq, 0, 2, 4
    MOVHL        m1, m0
    MOVHL        m3, m2
%endif
    paddw        m0, m6           ; +32 for rounding of the final >>6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    ; clear the coefficient block, as the C idct_add contract requires
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    movsxdifnidn r2, r2d
    IDCT4_ADD    r0, r1, r2
    RET

; One 1-D pass of the 8-point iDCT over eight rows of words.
; Rows 1,2,3,5,6,7 are expected in m1,m2,m3,m5,m6,m7 on entry; rows 0 and 4
; are taken from the memory operands %1 and %2.  Result rows 0..7 end up in
; m0..m7 (the final SWAP renumbers the registers back into order).
%macro IDCT8_1D 2
    psraw        m0, m1, 1
    SWAP 0, 1
    psraw        m4, m5, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    psraw        m7, m1, 2
    SWAP 7,1
    psraw        m3, m4, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    psraw        m5, m6, 1
    SWAP 5,6
    psraw        m4, m2, 1
    paddw        m6, m2
    psubw        m4, m5

    ; butterfly stage combining the even (mem rows 0/4) and odd halves
    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; Load rows 1,2,3,5,6,7 of an 8x8 block from %1 and run IDCT8_1D with rows
; 0 and 4 as memory operands (they are loaded inside IDCT8_1D itself).
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; First (column) pass of the 8x8 iDCT for the MMX path: transform one
; 4-column half of the block and store it transposed into the scratch
; buffer %2, so the second pass can again operate on rows.
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7           ; spill m7: TRANSPOSE4x4W needs a temp reg
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; Second (row) pass of the MMX 8x8 iDCT: transform the scratch buffer %2 and
; add the result to dst.  With the optional 4th arg, also zero the original
; 8x8 coefficient block at %4 (done only once per block, on the first half).
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    ; spill rows 5..7 so m5-m7 are free for STORE_DIFFx2 temps
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]      ; reload spilled rows 5..7
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    movsxdifnidn r2, r2d
    ; 128 bytes of scratch for the transposed block, 8-byte aligned
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32           ; fold the +32 rounding term into DC
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; Full 8x8 iDCT + add for the SSE2/xmm path (whole rows fit in one register).
; On x86-32 two rows are spilled to the coefficient buffer instead of using
; m8/m9, and the buffer is cleared afterwards via m7.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=scratch GPR (gets 3*stride)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]      ; rounding for the final >>6

%if ARCH_X86_64 == 0
    ; no m8/m9 on x86-32: pass rows 0/4 through memory
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6           ; spill rows 6/7 for later
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]      ; reload spilled rows 6/7
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    ; clear the coefficient block
    mova    [%2+  0], m7
    mova    [%2+ 16], m7
    mova    [%2+ 32], m7
    mova    [%2+ 48], m7
    mova    [%2+ 64], m7
    mova    [%2+ 80], m7
    mova    [%2+ 96], m7
    mova    [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    movsxdifnidn r2, r2d
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

; Prepare DC-only add: compute dc=(%1+32)>>6, broadcast dc into m0 and -dc
; into m1 (both packed to bytes).  Clobbers %1, leaving it = 3*%2 (3*stride)
; for use as the 4th arg of DC_ADD_MMXEXT_OP.
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

; Apply the broadcast DC to 4 rows of pixels at %2 (stride %3, %4 = 3*stride)
; using saturating add of +dc then saturating sub of -dc, which together
; implement a clipped signed add.  %1 = load/store mnemonic (movh/mova/...).
%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1   [%2     ], m2
    %1   [%2+%3  ], m3
    %1   [%2+%3*2], m4
    %1   [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov  dword  [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov  dword  [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; x86-32 variants: only 2 register args are loaded; stride is re-read from
; the stack (r2m) into r1 once r1 (block) is no longer needed.
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword  [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword  [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; Loop over the 16 luma 4x4 blocks, adding the iDCT of each block that has
; a non-zero nnzc entry.
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5           ; r5 = block index i
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6] ; nnzc[scan8[i]]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]      ; dst + block_offset[i]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32           ; next 4x4 coeff block (16 * int16)
    cmp          r5, 16
    jl .nextblock
    REP_RET

; As above but with 8x8 blocks: i steps by 4, coeffs by 128 bytes.
; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]    ; right 4-column half of the 8x8 block
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; add16 with a fast path: nnzc==1 plus a non-zero DC coeff means the block
; is DC-only, handled by the cheap DC broadcast instead of a full iDCT.
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
    ; on x86-32 there is no spare reg: reuse r1 (block_offset) as dst2
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m          ; restore clobbered block_offset
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; Intra variant: a block is processed if nnzc is set OR its DC coeff is
; non-zero (checked together via the 16-bit OR below).
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]    ; also catch a DC-only block
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; Intra variant with the mmxext DC-only fast path.
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m          ; restore clobbered block_offset
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; 8x8 variant with DC-only fast path (8 rows -> two DC_ADD passes).
; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4] ; lower 4 rows of the 8x8 block
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; SSE2 8x8 add4: the DC path deliberately drops to MMX registers
; (INIT_MMX cpuname) since DC_ADD uses only 64-bit loads/stores.
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; Helper (plain label, called via `call`): process 4 consecutive chroma 4x4
; blocks of one plane.  In: r5 = start index, r2 = coeffs, r3 = stride,
; r4 = nnzc, r1 = block_offset, dst2q / r0m = pointer to the dest pointer.
h264_idct_add8_mmx_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]    ; intra-style: DC coeff also triggers
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]      ; dest[plane] + block_offset[i]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3            ; stop after 4 blocks (i hits a multiple of 4)
    jnz .nextblock
    rep ret

; Note: dest is uint8_t** here — one pointer per chroma plane.
; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    mov          r5, 16           ; chroma blocks start at scan8 index 16
    add          r2, 512          ; skip the 16 luma coeff blocks
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize      ; advance to dest[1]
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET ; TODO: check rep ret after a function call

; 4:2:2 chroma layout: each plane has 8 blocks (two plane-helper calls).
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif

    mov          r5, 16  ; i
    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t

    call         h264_idct_add8_mmx_plane
    add          r5, 4
    call         h264_idct_add8_mmx_plane

%if ARCH_X86_64
    add       dst2q, gprsize ; dest[1]
%else
    add        r0mp, gprsize
%endif

    add          r5, 4   ; set to 32
    add          r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call         h264_idct_add8_mmx_plane
    add          r5, 4
    call         h264_idct_add8_mmx_plane

    RET ; TODO: check rep ret after a function call

; Same plane helper as above, plus the mmxext DC-only fast path.
h264_idct_add8_mmxext_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call         h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmxext_plane
    RET ; TODO: check rep ret after a function call

; DC-only add for an 8x4 pair of blocks: applies the two blocks' DC values
; (D and d, from [r2] and [r2+32]) side by side across 4 rows.
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movsxdifnidn r3, r3d
    movd         m0, [r2   ]          ;   0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;   x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d  d  D  D
    pxor         m1, m1               ;  0  0  0  0
    psubw        m1, m0               ; -d -d -D -D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; iDCT+add of two adjacent 4x4 blocks (an 8x4 strip): both blocks' rows are
; packed into the low/high halves of each xmm register.
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movsxdifnidn r3, r3d
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]          ; second block in the high halves
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D     w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw        m0, [pw_32]
    IDCT4_1D     w,0,1,2,3,4,5
    pxor         m7, m7
    mova    [r2+ 0], m7               ; clear both coeff blocks
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

; One unrolled add16 step: %1 = pair index, %2 = offset of the pair's
; combined nnzc word (two adjacent scan8 entries checked at once).
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5               ; r5 holds the saved dst pointer
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
REP_RET

; Intra add16 step: if nnzc is zero, still run the DC-only path when either
; of the pair's DC coefficients is non-zero.
%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7               ; r7 holds the saved dst pointer
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
REP_RET

; Chroma add8 step; dest is uint8_t** ([r7] / *(r0m) gives the plane ptr).
%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64           ; jump from U plane coeffs to V plane
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize          ; dest[1]
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
REP_RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

; One 1-D 4-point Walsh-Hadamard transform pass (registers %1..%4, temp %5).
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

; Dequantize: widen words to dwords (interleave with pw_1 + pmaddwd, which
; yields coeff*qmul + rounding since t3d carries 128<<16 folded into its
; high half), arithmetic-shift right, and re-pack to words.
; SSE2 form: 1 reg arg (shift), results in xmm0/xmm2.
; MMX form: 3 args (two coeff regs + shift), results in %1/%2.
%macro DEQUANT 1-3
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
%else
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endif
%endmacro

; Scatter the words of %1 into the output at t2, one word per 4x4 block
; (hence the *32 stride: 16 int16 coeffs per block).  SSE form handles 8
; words per call, MMX form 4.
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

; Dequantize all 16 DC values and scatter them to the 16 blocks' DC slots.
; %1 = right-shift amount.
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    DEQUANT     %1
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT      m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT      m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

; 4x4 luma DC inverse Hadamard transform + dequant.  %1 = number of xmm regs
; to declare/spill (0 for the pure-MMX build).
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16        ; fold the rounding term into qmul's high half
    DEQUANT_STORE 8
    RET
.big_qmul:
    ; qmul too large for 16-bit pmaddwd: pre-shift qmul right by up to 7
    ; (t0d) and reduce the final shift (t1d = 8 - t0d) to compensate
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
; sse2 variant still uses mmx regs for the Walsh pass; DEQUANT switches to
; xmm via cpuflag(sse2)
INIT_MMX sse2
IDCT_DC_DEQUANT 7

%ifdef __NASM_VER__
%if __NASM_MAJOR__ >= 2 && __NASM_MINOR__ >= 4
%unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%endif
%endif
; Local redefinition of STORE_DIFFx2 (same semantics as the x86util version):
; add two shifted residual rows to two pixel rows with unsigned saturation.
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movd         %3, [%7]
    movd         %4, [%7+%8]
    psraw        %1, %6
    psraw        %2, %6
    punpcklbw    %3, %5
    punpcklbw    %4, %5
    paddw        %3, %1
    paddw        %4, %2
    packuswb     %3, %5
    packuswb     %4, %5
    movd       [%7], %3
    movd    [%7+%8], %4
%endmacro

; xmm version of DC_ADD_MMXEXT_INIT: pshuflw instead of pshufw, and the
; 3*stride temp comes from the named stride_q register.
%macro DC_ADD_INIT 1
    add         %1d, 32
    sar         %1d, 6
    movd         m0, %1d
    pshuflw      m0, m0, 0
    lea          %1, [3*stride_q]
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

; Instantiate xmm (sse2/avx) versions of the plain 4x4 idct_add and
; dc_add entry points.
%macro IDCT_XMM 1

INIT_XMM %1

cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
RET

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx       r3d, word [block_q]
    mov dword [block_q], 0
    DC_ADD_INIT r3
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
RET

%endmacro

IDCT_XMM sse2
IDCT_XMM avx