;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NOTE: this file uses the x86inc/x86util macro framework (cglobal, mova,
; SBUTTERFLY, CLIPW, ABS1, PABSW, SPLATW, INIT_XMM, ...); register names such
; as m0..m15 and r0..r13 are virtual names resolved by x86inc.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; per-bit-depth pixel clamping limits and small word/dword constants
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_pixel_max_10: times 8 dw ((1 << 10)-1)
pw_m2:           times 8 dw -2
pd_1 :           times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; expands to [base],...,[base+7*stride]
; (base3 is expected to be base + 3*stride, stride3 = 3*stride)
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
; clobbers m4-m6
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8 (packed back to unsigned bytes)
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m2
    packuswb         m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m1
    pshufd           m1, m1, 0x39
    movd             %6, m1
    pshufd           m1, m1, 0x39
    movd             %7, m1
    pshufd           m1, m1, 0x39
    movd             %8, m1
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
; clobbers m4-m6
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8, clipped to [0, %9] (pixel max for the bit depth)
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7 (zero-extended to 16 bit)
; clobbers m9, m13, m15 (x86-64 only registers)
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro


; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8 (packed back to unsigned bytes)
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m4
    packuswb         m1, m5
    packuswb         m2, m6
    packuswb         m3, m7
    TRANSPOSE2x4x4B   0, 1, 2, 3, 4

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8, clipped to [0, %9] (pixel max for the bit depth)
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; %1 = select(%2, %1, mask) per word lane
; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; %1 = select(%2, %1, %3) per word lane
; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; input in m0 ... m3 and tcs in r2. Output in m1 and m2
; %1 = bit depth (8/10/12); clobbers m4-m6
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; input in m0 ... m7, beta in r2 tcs in r3. Output in m1...m6
; %1 = bit depth (8/10/12); %2 = v/h tag (not referenced in the body)
; Jumps to caller-provided labels: .bypassluma when no filtering is needed,
; .store (via fall-through or jz) when results are ready to be written back.
; Uses m8-m15 and r6-r13, so x86-64 only.
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11; 0d0, 0d3 , 1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps         r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and              r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and              r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov             r11, r6;
    shr             r11, 1;
    and              r6, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and              r6, 5; 0b101
    mov             r11, r6; strong mask
    shr              r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl              r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or               r6, r11; final strong mask, bits 1 and 0
    jz              .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;   (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2;  ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;       2*p3 + 2*p2
    paddw            m8, m1;       2*p3 + 3*p2
    paddw            m8, m10;      2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13
    paddw            m8, m13;      2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1;  ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;      p0 + q0
    paddw            m8, m5;          p0 + q0 + q1
    paddw            m8, m8;        2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;   p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;   p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13;  p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;   (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;       p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;       p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;      p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;       (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5;     ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 + q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not              r6; strong mask -> weak mask
    and              r6, r13; final weak filtering mask, bits 0 and 1
    jz              .store

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and              r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)


    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2             ; back up to p1 (2 bytes/pixel column)
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4             ; back up to p1 (2 pixels * 2 bytes)
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4             ; back up to p1 (2 pixels * 2 bytes)
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq       ; pix0 = pix - 2*stride (row of p1)
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];     q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb         m1, m2
    movh  [pix0q+strideq], m1
    movhps        [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq       ; pix0 = pix - 2*stride (row of p1)
    movu             m0, [pix0q];    p1
    movu             m1, [pix0q+strideq]; p0
    movu             m2, [pixq];     q0
    movu             m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor             m5, m5; zeros reg
    CLIPW            m1, m5, [pw_pixel_max_10]
    CLIPW            m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu          [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq       ; pix0 = pix - 2*stride (row of p1)
    movu             m0, [pix0q];    p1
    movu             m1, [pix0q+strideq]; p0
    movu             m2, [pixq];     q0
    movu             m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor             m5, m5; zeros reg
    CLIPW            m1, m5, [pw_pixel_max_12]
    CLIPW            m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu          [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4             ; back up to p3 (4 bytes/pixel column)
    lea           pix0q, [3 * r1]      ; r1 == strideq
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8             ; back up to p3 (4 pixels * 2 bytes)
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8             ; back up to p3 (4 pixels * 2 bytes)
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq       ; pix0 = pix - 4*stride (row of p3)
    movq             m0, [pix0q];       p3
    movq             m1, [pix0q +     strideq]; p2
    movq             m2, [pix0q + 2 * strideq]; p1
    movq             m3, [pix0q + src3strideq]; p0
    movq             m4, [pixq];        q0
    movq             m5, [pixq +     strideq]; q1
    movq             m6, [pixq + 2 * strideq]; q2
    movq             m7, [pixq + src3strideq]; q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb         m1, m2
    packuswb         m3, m4
    packuswb         m5, m6
    movh   [pix0q +     strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq  +     strideq], m5
    movhps [pixq  + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq       ; pix0 = pix - 4*stride (row of p3)
    movdqu           m0, [pix0q];       p3
    movdqu           m1, [pix0q +     strideq]; p2
    movdqu           m2, [pix0q + 2 * strideq]; p1
    movdqu           m3, [pix0q + src3strideq]; p0
    movdqu           m4, [pixq];        q0
    movdqu           m5, [pixq +     strideq]; q1
    movdqu           m6, [pixq + 2 * strideq]; q2
    movdqu           m7, [pixq + src3strideq]; q3
    LUMA_DEBLOCK_BODY 10, h
.store:
    pxor             m8, m8; zeros reg
    CLIPW            m1, m8, [pw_pixel_max_10]
    CLIPW            m2, m8, [pw_pixel_max_10]
    CLIPW            m3, m8, [pw_pixel_max_10]
    CLIPW            m4, m8, [pw_pixel_max_10]
    CLIPW            m5, m8, [pw_pixel_max_10]
    CLIPW            m6, m8, [pw_pixel_max_10]
    movdqu [pix0q +     strideq], m1;  p2
    movdqu [pix0q + 2 * strideq], m2;  p1
    movdqu [pix0q + src3strideq], m3;  p0
    movdqu [pixq               ], m4;  q0
    movdqu [pixq  +     strideq], m5;  q1
    movdqu [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq       ; pix0 = pix - 4*stride (row of p3)
    movdqu           m0, [pix0q];       p3
    movdqu           m1, [pix0q +     strideq]; p2
    movdqu           m2, [pix0q + 2 * strideq]; p1
    movdqu           m3, [pix0q + src3strideq]; p0
    movdqu           m4, [pixq];        q0
    movdqu           m5, [pixq +     strideq]; q1
    movdqu           m6, [pixq + 2 * strideq]; q2
    movdqu           m7, [pixq + src3strideq]; q3
    LUMA_DEBLOCK_BODY 12, h
.store:
    pxor             m8, m8; zeros reg
    CLIPW            m1, m8, [pw_pixel_max_12]
    CLIPW            m2, m8, [pw_pixel_max_12]
    CLIPW            m3, m8, [pw_pixel_max_12]
    CLIPW            m4, m8, [pw_pixel_max_12]
    CLIPW            m5, m8, [pw_pixel_max_12]
    CLIPW            m6, m8, [pw_pixel_max_12]
    movdqu [pix0q +     strideq], m1;  p2
    movdqu [pix0q + 2 * strideq], m2;  p1
    movdqu [pix0q + src3strideq], m3;  p0
    movdqu [pixq               ], m4;  q0
    movdqu [pixq  +     strideq], m5;  q1
    movdqu [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif