;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM/Yasm (Intel operand order) on top of the x86inc/x86util macro
; layer (cglobal, mN/rN register aliases, INIT_MMX/INIT_XMM, SBUTTERFLY, ...).

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1:  times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; same as above, with a constant byte offset added to every row address
%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: %1..%3 = punpck suffixes for the three interleave stages
;     8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
; clobbers: m4-m7
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0-m6
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

; byte-granularity wrapper around TRANSPOSE4x8_LOAD
%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

; three-operand butterfly: %4 = punpckh(%2, %3), %2 = punpckl(%2, %3)
; (%3 may be a memory operand)
%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
; note: %9 and %11 are also used as scratch storage during the transpose
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = saturate(|%1-%2| - %3), i.e. nonzero in each byte where |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xFF in each byte where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
;      (note the polarity: this is the inverse of the DIFF_GT condition)
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6             ; mask = 0xFF where none of the three tests fired
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
; The names describe the q side; the callers also use it for the p side by
; passing p1/p2 in place of q1/q2 (the inline comments below use that naming).
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
    movsxd  r1, r1d    ; stride is a 32-bit int: upper register half is undefined
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8   ; 0xFF where tc0 == -1
    pandn   m9, m7   ; mask restricted to lanes with tc0 != -1
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = |p2-p0| <= beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = |q2-q0| <= beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7,  r1d
    lea    r8,  [r7+r7*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_8 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    ; step back 8 rows; r7 is temporarily 8*stride
    shl  r7,  3
    sub  r6,  r7
    sub  r5,  r7
    sar  r7,  3        ; arithmetic shift so that a negative stride is restored correctly
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
%endif

%else

; %1 = function name infix (v8 or v), %2 = vector width in bytes
%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = |p2-p0| <= beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = |q2-q0| <= beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12*HAVE_ALIGNED_STACK

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH



; Strong (intra) filter for one side of the edge.
; Uses the p0/p1/p2/q0/q1, t0-t5, mask0/mask1p and mpb_0/mpb_1 aliases that
; DEBLOCK_LUMA_INTRA (and LUMA_INTRA_SWAP_PQ) define.
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

; Re-alias the p/q names so that LUMA_INTRA_P012 can be reused for the q side.
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

; %1 = function name infix (v or v8)
%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
%if ARCH_X86_64
    movsxd  r1, r1d    ; stride is a 32-bit int: upper register half is undefined
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| <= alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; t2 = |p2-p0| <= beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| <= beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| <= alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| <= beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| <= beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7,  r1d
    lea    r8,  [r7*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    ; step back 8 rows; r7 is temporarily 8*stride
    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    sar    r7,  3      ; arithmetic shift so that a negative stride is restored correctly
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

; in: r0=pix r1=stride r2d=alpha r3d=beta
; out: r2d=alpha-1 r3d=beta-1 t5=pix-2*stride
%macro CHROMA_V_START 0
%if ARCH_X86_64
    movsxd r1, r1d  ; stride is a 32-bit int: upper register half is undefined
%endif
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

; in: r0=pix r1=stride r2d=alpha r3d=beta
; out: r2d=alpha-1 r3d=beta-1 t5=pix-2 r0=pix-2+3*stride t6=3*stride
%macro CHROMA_H_START 0
%if ARCH_X86_64
    movsxd r1, r1d  ; stride is a 32-bit int: upper register half is undefined
%endif
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub   rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    ; m0 (p1) and m3 (q1) are clobbered by the filter but needed for the store
    movq  buf0, m0
    movq  buf1, m3
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add   rsp, 16
%endif
    RET

; in: m0=p1 m1=p0 m2=q0 m3=q1 r2d=alpha-1 r3d=beta-1 r4=tc0
; out: m1=p0' m2=q0'
ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
; clobbers: m4
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

%define t5 r4
%define t6 r5

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

; in: m0=p1 m1=p0 m2=q0 m3=q1 r2d=alpha-1 r3d=beta-1
; out: m1=p0' m2=q0' (m0 and m3 are preserved)
ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    ; blend: keep the filtered value only where the mask is set
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd    %1
%define stepd     %2
%define mask_mvd  %3
%define dir       %4
%define d_idx     %5
%define mask_dir  %6
%define bidir     %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7
    pminub           m0, m7
    psllw            m1, 1
    pxor             m2, m2
    pmaxub           m1, m0
    punpcklbw        m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
; the bidir register is reused as the loop index once bidir has been tested
%define b_idxq bidirq
%define b_idxd bidird
    cmp    dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    ; scale the loop parameters by 8 so that they can be used as byte offsets
    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET