;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max_10 pw_1023
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_m2:           times 8 dw -2
pd_1 :           times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; Load and transpose a 8x4 byte block, widening bytes to words.
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    ; widen bytes to words (zero-extend via interleave with zero)
    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; Inverse of TRANSPOSE4x8B_LOAD: narrow words back to bytes (with unsigned
; saturation) and transpose to the original layout.
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m2
    packuswb         m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd             %1, m0
    pshufd           m0, m0, 0x39       ; rotate dwords right to expose next row
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m1
    pshufd           m1, m1, 0x39
    movd             %6, m1
    pshufd           m1, m1, 0x39
    movd             %7, m1
    pshufd           m1, m1, 0x39
    movd             %8, m1
%endmacro

; High-bit-depth variant of TRANSPOSE4x8B_LOAD (words instead of bytes).
; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; Transpose back and store a high-bit-depth 8x4 block, clipping every word
; to [0, %9] first.
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8; %9 = pixel max (bit-depth clamp)
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; Load and transpose a full 8x8 byte block, widening to words.
; Uses m9/m13/m15 as scratch, so x86-64 only (call sites are guarded
; by ARCH_X86_64 below).
; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro


; Inverse of TRANSPOSE8x8B_LOAD: pack words to bytes (unsigned saturation,
; which also clips to [0,255]) and transpose back.
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m4
    packuswb         m1, m5
    packuswb         m2, m6
    packuswb         m3, m7
    TRANSPOSE2x4x4B   0, 1, 2, 3, 4

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; Load and transpose a high-bit-depth 8x8 word block.
; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; Transpose back and store a high-bit-depth 8x8 block, clipping each word
; to [0, %9] first.
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8; %9 = pixel max (bit-depth clamp)
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; %1 = (%1 & ~m11) | (%2 & m11), i.e. copy %2 into %1 where the mask is set.
; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; Same as MASKED_COPY but with an explicit mask register.
; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; HEVC chroma deblock: computes delta0 from p1/p0/q0/q1 and applies it,
; clipped to [-tc, tc].
; %1 = bit depth (8/10/12)
; input in m0 ... m3 and tcs in r2. Output in m1 and m2
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; HEVC luma deblock with strong/weak filter decision.
; %1 = bit depth (8/10/12), %2 = direction tag (v/h, unused in the body).
; Uses m8-m15 and r6-r13 as scratch -> x86-64 only (call sites are guarded
; by ARCH_X86_64). Jumps to the .store / .bypassluma labels that each
; caller defines after the macro expansion.
; input in m0 ... m7, beta in r2 tcs in r3. Output in m1...m6
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11; 0d0, 0d3 , 1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps         r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask: broadcast bits 3 and 0 of r13 into per-block dwords
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma; both tc zero -> nothing to filter
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and              r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and              r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov             r11, r6;
    shr             r11, 1;
    and              r6, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9;  tc * 2

    and              r6, 5; 0b101
    mov             r11, r6; strong mask
    shr              r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl              r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or               r6, r11; final strong mask, bits 1 and 0
    jz      .weakfilter

    ; strong filter: compute p0'..p2', q0'..q2' and merge them in under
    ; the combined (filtering & strong) mask
    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2; ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;   2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13; 4 in every cell
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;   p0 + q0
    paddw            m8, m5;       p0 + q0 + q1
    paddw            m8, m8;     2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;      p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;      p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13;     p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;      (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;       p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;       p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;      p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;       (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5;     ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 + q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not              r6; strong mask -> weak mask
    and              r6, r13; final weak filtering mask, bits 0 and 1
    jz             .store

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and              r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)

    ; discard blocks where abs(delta0) >= 10 * tc
    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];     q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY 8
    packuswb         m1, m2
    movh [pix0q+strideq], m1
    movhps       [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movu             m0, [pix0q];    p1
    movu             m1, [pix0q+strideq]; p0
    movu             m2, [pixq];     q0
    movu             m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor             m5, m5; zeros reg
    CLIPW            m1, m5, [pw_pixel_max_10]
    CLIPW            m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu         [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movu             m0, [pix0q];    p1
    movu             m1, [pix0q+strideq]; p0
    movu             m2, [pixq];     q0
    movu             m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor             m5, m5; zeros reg
    CLIPW            m1, m5, [pw_pixel_max_12]
    CLIPW            m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu         [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4
    lea           pix0q, [3 * r1]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];               p3
    movq             m1, [pix0q +     strideq]; p2
    movq             m2, [pix0q + 2 * strideq]; p1
    movq             m3, [pix0q + src3strideq]; p0
    movq             m4, [pixq];                q0
    movq             m5, [pixq +     strideq];  q1
    movq             m6, [pixq + 2 * strideq];  q2
    movq             m7, [pixq + src3strideq];  q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb         m1, m2
    packuswb         m3, m4
    packuswb         m5, m6
    movh   [pix0q +     strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq  +     strideq], m5
    movhps [pixq  + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movdqu           m0, [pix0q];               p3
    movdqu           m1, [pix0q +     strideq]; p2
    movdqu           m2, [pix0q + 2 * strideq]; p1
    movdqu           m3, [pix0q + src3strideq]; p0
    movdqu           m4, [pixq];                q0
    movdqu           m5, [pixq  +     strideq]; q1
    movdqu           m6, [pixq  + 2 * strideq]; q2
    movdqu           m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY 10, h
.store:
    pxor             m8, m8; zeros reg
    CLIPW            m1, m8, [pw_pixel_max_10]
    CLIPW            m2, m8, [pw_pixel_max_10]
    CLIPW            m3, m8, [pw_pixel_max_10]
    CLIPW            m4, m8, [pw_pixel_max_10]
    CLIPW            m5, m8, [pw_pixel_max_10]
    CLIPW            m6, m8, [pw_pixel_max_10]
    movdqu [pix0q +     strideq], m1;  p2
    movdqu [pix0q + 2 * strideq], m2;  p1
    movdqu [pix0q + src3strideq], m3;  p0
    movdqu [pixq               ], m4;  q0
    movdqu [pixq  +     strideq], m5;  q1
    movdqu [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movdqu           m0, [pix0q];               p3
    movdqu           m1, [pix0q +     strideq]; p2
    movdqu           m2, [pix0q + 2 * strideq]; p1
    movdqu           m3, [pix0q + src3strideq]; p0
    movdqu           m4, [pixq];                q0
    movdqu           m5, [pixq  +     strideq]; q1
    movdqu           m6, [pixq  + 2 * strideq]; q2
    movdqu           m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY 12, h
.store:
    pxor             m8, m8; zeros reg
    CLIPW            m1, m8, [pw_pixel_max_12]
    CLIPW            m2, m8, [pw_pixel_max_12]
    CLIPW            m3, m8, [pw_pixel_max_12]
    CLIPW            m4, m8, [pw_pixel_max_12]
    CLIPW            m5, m8, [pw_pixel_max_12]
    CLIPW            m6, m8, [pw_pixel_max_12]
    movdqu [pix0q +     strideq], m1;  p2
    movdqu [pix0q + 2 * strideq], m2;  p1
    movdqu [pix0q + src3strideq], m3;  p0
    movdqu [pixq               ], m4;  q0
    movdqu [pixq  +     strideq], m5;  q1
    movdqu [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif