;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Per-block byte offsets into the nnzc[] array. The multi-block functions
; below read nnzc[scan8[i]] to decide whether block i needs processing.
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

; In PIC builds the table address is kept in an extra register (picregq),
; otherwise the table symbol is addressed directly.
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; Sign-extend the 32-bit "int stride" argument held in register %1 to the
; full register width. The stride is used below in 64-bit address arithmetic,
; and on x86-64 the upper half of a register carrying a 32-bit integer
; argument is not guaranteed by the calling convention. Expands to nothing
; on x86-32.
%macro SIGNEXT_STRIDE 1
%if ARCH_X86_64
    movsxd       %1, %1d
%endif
%endmacro

; 4x4 inverse transform of one block, added to the destination pixels.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; Clobbers m0-m7 and advances %1 by 2*stride. The 32 coefficient bytes at
; %2 are zeroed after being consumed.
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6                 ; add pw_32 before the second pass
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7                 ; clear the coefficient block
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    SIGNEXT_STRIDE r2
    IDCT4_ADD    r0, r1, r2
    RET

; One 1-D pass of the 8-point inverse transform.
; Rows 1,2,3,5,6,7 are expected in m1,m2,m3,m5,m6,m7; rows 0 and 4 are given
; as operands %1 and %2 (memory or register). Results are left in m0-m7
; (the final SWAP renames the registers into natural order).
%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; Load all eight rows (16-byte row pitch) from %1 and run one IDCT8_1D pass.
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; First pass of the MMX 8x8 transform on a 4-column half of the block.
; %1=int16_t *block, %2=int16_t *dstblock
; The transposed result is written to %2; [%1] is used as a one-register
; spill slot while transposing.
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7                 ; spill m7, it is needed as transpose temp
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; Second pass of the MMX 8x8 transform on a 4-pixel-wide column strip, added
; to the destination.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; Optional %4: pointer to a 128-byte coefficient block that is zeroed.
; Advances %1 by 6*stride and overwrites the first rows at %2 (spill space).
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5                 ; spill rows 5-7; m5-m7 are reused below
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]            ; reload spilled rows 5-7
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
; Uses a 128-byte stack buffer for the transposed intermediate block.
cglobal h264_idct8_add_8, 3, 4, 0
    SIGNEXT_STRIDE r2
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32                 ; add 32 to block[0] before transforming
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]             ; dst of the right 4-pixel half
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; Full 8x8 inverse transform with XMM registers, added to the destination.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; %4=scratch register, set to 3*stride
; Advances %1 by 4*stride and zeroes the 128-byte block at %2. On x86-32 the
; block memory doubles as spill space; on x86-64 m8/m9 are used instead.
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]            ; reload rows 6 and 7
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7                 ; clear the coefficient block
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    SIGNEXT_STRIDE r2
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

; Prepare the DC-only add.
; %1 = register holding the DC coefficient (in); 3*stride (out)
; %2 = stride
; Out: m0 = bytes of max((dc+32)>>6, 0), m1 = bytes of max(-((dc+32)>>6), 0),
;      both saturated to 0..255.
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

; Apply the DC prepared by DC_ADD_MMXEXT_INIT to four rows.
; %1 = load/store instruction, %2 = dst, %3 = stride, %4 = 3*stride
; The saturating add of m0 followed by the saturating subtract of m1 gives a
; clamped signed add.
%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    SIGNEXT_STRIDE r2
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    SIGNEXT_STRIDE r2
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; x86-32: only two arguments are loaded into registers; the stride is
; fetched from the stack once the block pointer is no longer needed.
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
; Loops over 16 blocks (r5 = index, r2 advances 32 bytes per block) and runs
; IDCT4_ADD on every block whose nnzc[scan8[i]] is non-zero.
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    SIGNEXT_STRIDE r3
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
; Loops over four 8x8 blocks (r5 steps by 4, r2 by 128 bytes) and transforms
; those whose nnzc[scan8[i]] is non-zero.
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    SIGNEXT_STRIDE r3
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]    ; r6 was advanced above, recompute dst
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
; Like the mmx version, but a block whose nnzc entry is exactly 1 and whose
; first coefficient is non-zero takes the DC-only path.
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    SIGNEXT_STRIDE r3
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
; x86-32: reuse r1 as the second dst register; it is reloaded from the stack
; afterwards
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
; A block is transformed when either its nnzc entry or its first coefficient
; is non-zero.
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    SIGNEXT_STRIDE r3
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
; Blocks with a non-zero nnzc entry get the full 4x4 transform; otherwise a
; non-zero first coefficient takes the DC-only path.
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    SIGNEXT_STRIDE r3
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
; Like the mmx version, with a DC-only path for blocks whose nnzc entry is
; exactly 1 and whose first coefficient is non-zero.
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    SIGNEXT_STRIDE r3
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
; The DC-only path is assembled as MMX code (INIT_MMX), the full transform as
; XMM code (INIT_XMM).
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    SIGNEXT_STRIDE r3
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; Internal helper for h264_idct_add8_8 (mmx): processes blocks starting at
; index r5 until r5 becomes a multiple of 4.
; r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc, r5 = block index;
; the destination base pointer is read through dst2q (x86-64) or through the
; caller's first stack argument (x86-32). Clobbers r0 and r6.
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
; Runs the plane helper twice: from block index 16 with dest[0], then from
; block index 32 with dest[1].
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    SIGNEXT_STRIDE r3
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize            ; step to the next dest[] pointer
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

; Internal helper for h264_idct_add8_8 (mmxext): same register contract as
; h264_idct_add8_mmx_plane, with an additional DC-only path for blocks whose
; nnzc entry is zero but whose first coefficient is not.
h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    SIGNEXT_STRIDE r3
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET

; DC-only add for two horizontally adjacent 4x4 blocks (8 pixels x 4 rows).
; The two DC values are read from [r2] and [r2+32] and both are cleared.
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; Full 4x4 transform of two adjacent blocks at once (low halves of m0-m3 come
; from [r2], high halves from [r2+32]); the 64 coefficient bytes are zeroed.
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw        m0, [pw_32]
    IDCT4_1D      w,0,1,2,3,4,5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

; One unrolled step of h264_idct_add16_8_sse2.
; %1 = step index (selects block_offset[2*%1]), %2 = byte offset into nnzc
; of the two adjacent entries tested together as one word.
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
; On x86-64 the dst base is kept in r5 because r0 is clobbered by each step.
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    SIGNEXT_STRIDE r3
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

; One unrolled step of h264_idct_add16intra_8_sse2: full transform when the
; nnzc pair is non-zero, otherwise DC-only add when either DC is non-zero.
; %1 = step index, %2 = byte offset into nnzc. The dst base is in r7 on
; x86-64 and in the first stack argument on x86-32.
%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    SIGNEXT_STRIDE r3
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

; One unrolled step of h264_idct_add8_8_sse2.
; %1 = step index (0..3), %2 = byte offset into nnzc. The destination base is
; read through the dest[] pointer (r7 on x86-64, first stack argument on
; x86-32). After step 1 the block pointer additionally skips 384 bytes.
%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    SIGNEXT_STRIDE r3
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize            ; step to the next dest[] pointer
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

; One 1-D pass over registers %1-%4 built from two SUMSUB_BADC butterflies;
; %5 is a temporary.
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP         %1, %4, %3
%endmacro

; MMX dequantisation of two registers of four words each.
; %1, %2 = in/out registers, %3 = shift count (immediate or register).
; Each word is paired with 1 and multiplied (pmaddwd) by the dword in t3d,
; then shifted right arithmetically and packed back to words.
; Clobbers m4, m5, m7.
%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endmacro

; Scatter the words of register %1 to [t2 + idx*32] for each index argument.
; %2..%5 (and %6..%9 in the XMM variant) are the destination indices.
; Clobbers %1, t0 and t1.
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

; Dequantise m0-m3 with the multiplier in t3d and shift %1, then scatter the
; 16 results through STORE_WORDS. The SSE2 variant moves the MMX registers
; into xmm0-xmm3 first.
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0
    movq2dq    xmm0, m0
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

; Emits h264_luma_dc_dequant_idct for the current instruction set.
; %1 = number of XMM registers used (for the Win64 spill).
; The 4x4 input at r1 goes through two WALSH4_1D passes, then is dequantised
; with qmul and scattered to the output. When qmul exceeds 32767 it is
; pre-shifted right (by at most 7 bits) and the final shift reduced to match.
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D     0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16          ; place 128 in the upper word of the multiplier
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d                ; t0 = min(bsr(qmul), 7)
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d                ; remaining shift = 8 - t0
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7