;******************************************************************************
;* x86-optimized functions for the CFHD decoder
;* Copyright (c) 2020 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Interleaved word coefficient pairs for pmaddwd: each dword lane of the
; result is a[2i] * c0 + a[2i+1] * c1.
factor_p1_n1:  dw  1, -1,  1, -1,  1, -1,  1, -1
factor_n1_p1:  dw -1,  1, -1,  1, -1,  1, -1,  1
factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4
factor_p5_p4:  dw  5,  4,  5,  4,  5,  4,  5,  4
pd_4:    times 4 dd 4           ; rounding term added before the >> 3
pw_1:    times 8 dw 1
pw_0:    times 8 dw 0           ; lower clip bound
pw_1023: times 8 dw 1023        ; upper clip bound used by the clip10 variant
pw_4095: times 8 dw 4095        ; upper clip bound used by the clip12 variant

SECTION .text

;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clipmax
;
; Emits one horizontal reconstruction function, selected by %1:
;   %1 == 0    -> cfhd_horiz_filter         (no clipping, loops over rows)
;   %1 == 1023 -> cfhd_horiz_filter_clip10  (single row, CLIPW to [0, 1023])
;   %1 == 4095 -> cfhd_horiz_filter_clip12  (single row, CLIPW to [0, 4095])
;
; All samples are accessed as 16-bit words; width and the three strides are
; doubled on entry so they can be used as byte offsets.
;
; Per row, with w = width (in samples) and >> being an arithmetic shift:
;   out[0]      = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]      = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2i]     = (((low[i-1] - low[i+1] + 4) >> 3) + low[i] + high[i]) >> 1
;   out[2i + 1] = (((low[i+1] - low[i-1] + 4) >> 3) + low[i] - high[i]) >> 1
;   out[2w - 2] = ((( 5*low[w-1] + 4*low[w-2] - low[w-3] + 4) >> 3) + high[w-1]) >> 1
;   out[2w - 1] = (((11*low[w-1] - 4*low[w-2] + low[w-3] + 4) >> 3) - high[w-1]) >> 1
;
; The vector loop advances by mmsize bytes of input per iteration until the
; byte offset reaches 2 * width, and the two trailing outputs are written
; after it.
; NOTE(review): the vector loop therefore appears to rely on the caller's
; buffers tolerating full-vector loads/stores near the row end - confirm
; against the C caller.
;
; The cglobal operands (argument count, GPR count, XMM count, register names)
; follow the x86inc convention; see x86inc.asm for their exact meaning.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
; The stride names below are only referenced inside %if %1 == 0 sections,
; so they are effectively unused in the clip variants.
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    shl      ostrided, 1
    shl       lwidthd, 1
    shl       hwidthd, 1
    shl        widthd, 1

    ; y counts rows from -height up to 0
    mov            yd, heightd
    neg            yq
%else
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    ; Arguments 1, 3 and 5 (the strides) arrive in the registers named
    ; x, y and temp; double them and write them back to their stack argument
    ; slots so the registers can be reused as scratch.
    shl            xd, 1
    shl            yd, 1
    shl         tempd, 1
    shl        widthd, 1

    mov           xmp, xq
    mov           ymp, yq
    mov        tempmp, tempq

    ; y counts rows from -height (8th argument) up to 0
    mov            yd, r7m
    neg            yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; Keep the constants in registers m8-m11, which are only used on x86-64
    mova           m8, [factor_p1_n1]
    mova           m9, [factor_n1_p1]
    mova          m10, [pw_1]
    mova          m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; out[0] = (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
    movsx          xq, word [lowq]
    imul           xq, 11

    movsx       tempq, word [lowq + 2]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov word [outputq], tempw

    ; out[1] = (((5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
    movsx          xq, word [lowq]
    imul           xq, 5

    movsx       tempq, word [lowq + 2]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov word [outputq + 2], tempw

    xor            xd, xd       ; x = byte offset into low/high

.loop:
    ; m4/m5 = interleaved (low[i], low[i + 2]) word pairs
    movu           m4, [lowq + xq]
    movu           m1, [lowq + xq + 4]

    mova           m5, m4
    punpcklwd      m4, m1
    punpckhwd      m5, m1

    mova           m6, m4
    mova           m7, m5

    ; m4/m5 = (low[i] - low[i + 2] + 4) >> 3
    ; m6/m7 = (low[i + 2] - low[i] + 4) >> 3
%if ARCH_X86_64
    pmaddwd        m4, m8
    pmaddwd        m5, m8
    pmaddwd        m6, m9
    pmaddwd        m7, m9

    paddd          m4, m11
    paddd          m5, m11
    paddd          m6, m11
    paddd          m7, m11
%else
    pmaddwd        m4, [factor_p1_n1]
    pmaddwd        m5, [factor_p1_n1]
    pmaddwd        m6, [factor_n1_p1]
    pmaddwd        m7, [factor_n1_p1]

    paddd          m4, [pd_4]
    paddd          m5, [pd_4]
    paddd          m6, [pd_4]
    paddd          m7, [pd_4]
%endif

    psrad          m4, 3
    psrad          m5, 3
    psrad          m6, 3
    psrad          m7, 3

    ; m2/m0 = interleaved (low[i + 1], high[i + 1]) word pairs
    movu           m2, [lowq + xq + 2]
    movu           m3, [highq + xq + 2]

    mova           m0, m2
    punpcklwd      m2, m3
    punpckhwd      m0, m3

    mova           m1, m2
    mova           m3, m0

    ; m2/m0 = low[i + 1] + high[i + 1], m1/m3 = low[i + 1] - high[i + 1]
%if ARCH_X86_64
    pmaddwd        m2, m10
    pmaddwd        m0, m10
    pmaddwd        m1, m8
    pmaddwd        m3, m8
%else
    pmaddwd        m2, [pw_1]
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [factor_p1_n1]
    pmaddwd        m3, [factor_p1_n1]
%endif

    paddd          m2, m4
    paddd          m0, m5
    paddd          m1, m6
    paddd          m3, m7

    psrad          m2, 1
    psrad          m0, 1
    psrad          m1, 1
    psrad          m3, 1

    ; Pack to words (signed saturation), then interleave even/odd outputs
    packssdw       m2, m0
    packssdw       m1, m3

    mova           m0, m2
    punpcklwd      m2, m1
    punpckhwd      m0, m1

%if %1
    CLIPW          m2, [pw_0], [pw_%1]
    CLIPW          m0, [pw_0], [pw_%1]
%endif

    ; Two output words per input word; skip the 2 outputs written above
    movu [outputq + xq * 2 + 4], m2
    movu [outputq + xq * 2 + mmsize + 4], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .loop

    ; Point at the end of the row so the tail can use negative offsets
    add          lowq, widthq
    add         highq, widthq
    add       outputq, widthq
    add       outputq, widthq

    ; out[2w - 2] = (((5*low[w-1] + 4*low[w-2] - low[w-3] + 4) >> 3) + high[w-1]) >> 1
    movsx          xq, word [lowq - 2]
    imul           xq, 5

    movsx       tempq, word [lowq - 4]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov word [outputq - 4], tempw

    ; out[2w - 1] = (((11*low[w-1] - 4*low[w-2] + low[w-3] + 4) >> 3) - high[w-1]) >> 1
    movsx          xq, word [lowq - 2]
    imul           xq, 11

    movsx       tempq, word [lowq - 4]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov word [outputq - 2], tempw

%if %1 == 0
    ; Rewind to the start of the row, then step to the next one.
    ; The output pointer advances by twice the (already doubled) ostride.
    sub          lowq, widthq
    sub         highq, widthq
    sub       outputq, widthq
    sub       outputq, widthq

    add          lowq, lwidthq
    add         highq, hwidthq
    add       outputq, ostrideq
    add       outputq, ostrideq
    add            yq, 1
    jl .looph
%endif

    RET
%endmacro

INIT_XMM sse2
CFHD_HORIZ_FILTER 0

INIT_XMM sse2
CFHD_HORIZ_FILTER 1023

INIT_XMM sse2
CFHD_HORIZ_FILTER 4095

;------------------------------------------------------------------------------
; cfhd_vert_filter(output, ostride, low, lwidth, high, hwidth, width, height)
;
; Vertical counterpart of the horizontal filter: processes the image in
; column strips of mmsize bytes, walking down the rows of each strip.
; Strides and width are doubled on entry (16-bit samples -> byte offsets).
;
; With low[y]/high[y] denoting row y of the respective input, out[r] row r of
; the output, and h = height:
;   out[0]      = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]      = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2y]     = (((low[y-1] - low[y+1] + 4) >> 3) + low[y] + high[y]) >> 1
;   out[2y + 1] = (((low[y+1] - low[y-1] + 4) >> 3) + low[y] - high[y]) >> 1
;   out[2h - 2] = ((( 5*low[h-1] + 4*low[h-2] - low[h-3] + 4) >> 3) + high[h-1]) >> 1
;   out[2h - 1] = (((11*low[h-1] - 4*low[h-2] + low[h-3] + 4) >> 3) - high[h-1]) >> 1
; The middle rows run from y = 1 while y < h - 1; the last pair is computed
; at the y value left by that loop.
;
; Single rows are widened from words to dwords with the
; "pxor / punpck?wd zero, src / psrad 16" sequence (sign extension).
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    shl      ostrided, 1
    shl       lwidthd, 1
    shl       hwidthd, 1
    shl        widthd, 1

    dec       heightd           ; height now holds the last row index

    mova           m8, [factor_p1_n1]
    mova           m9, [factor_n1_p1]
    mova          m10, [pw_1]
    mova          m11, [pd_4]
    mova          m12, [factor_p11_n4]
    mova          m13, [factor_p5_p4]
%else
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    ; Arguments 1, 3 and 5 (the strides) arrive in the registers named
    ; x, y and pos; double them and write them back to their stack argument
    ; slots so the registers can be reused as scratch.
    shl            xd, 1
    shl            yd, 1
    shl          posd, 1
    shl        widthd, 1

    mov           xmp, xq
    mov           ymp, yq
    mov         posmp, posq

    ; height - 1 is stored into the stack slot of the width argument;
    ; the doubled width itself stays in the widthq register.
    mov            xq, r7m
    dec            xq
    mov       widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    xor            xq, xq       ; x = byte offset of the current column strip
.loopw:
    xor            yq, yq

    ; ---- out[0]: 11*low[0] - 4*low[1] via pmaddwd, + low[2], + high[0] ----
    mov          posq, xq
    movu           m0, [lowq + posq]
    add          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m2, m0
    punpcklwd      m0, m1
    punpckhwd      m2, m1

%if ARCH_X86_64
    pmaddwd        m0, m12
    pmaddwd        m2, m12
%else
    pmaddwd        m0, [factor_p11_n4]
    pmaddwd        m2, [factor_p11_n4]
%endif

    pxor           m4, m4
    add          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    paddd          m0, m4
    paddd          m2, m3

    ; pd_4 already lives in m11 on x86-64; use it like the bottom edge does
%if ARCH_X86_64
    paddd          m0, m11
    paddd          m2, m11
%else
    paddd          m0, [pd_4]
    paddd          m2, [pd_4]
%endif

    psrad          m0, 3
    psrad          m2, 3

    mov          posq, xq
    pxor           m4, m4
    movu           m1, [highq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    paddd          m0, m4
    paddd          m2, m3

    psrad          m0, 1
    psrad          m2, 1

    packssdw       m0, m2

    movu [outputq + posq], m0

    ; ---- out[1]: 5*low[0] + 4*low[1] via pmaddwd, - low[2], - high[0] ----
    movu           m0, [lowq + posq]
    add          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m2, m0
    punpcklwd      m0, m1
    punpckhwd      m2, m1

%if ARCH_X86_64
    pmaddwd        m0, m13
    pmaddwd        m2, m13
%else
    pmaddwd        m0, [factor_p5_p4]
    pmaddwd        m2, [factor_p5_p4]
%endif

    pxor           m4, m4
    add          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    psubd          m0, m4
    psubd          m2, m3

%if ARCH_X86_64
    paddd          m0, m11
    paddd          m2, m11
%else
    paddd          m0, [pd_4]
    paddd          m2, [pd_4]
%endif

    psrad          m0, 3
    psrad          m2, 3

    mov          posq, xq
    pxor           m4, m4
    movu           m1, [highq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    psubd          m0, m4
    psubd          m2, m3

    psrad          m0, 1
    psrad          m2, 1

    packssdw       m0, m2

    add          posq, ostrideq
    movu [outputq + posq], m0

    add            yq, 1
.looph:
    ; pos = (y - 1) * lwidth + x  -> low[y - 1]
    mov          posq, lwidthq
    imul         posq, yq
    sub          posq, lwidthq
    add          posq, xq

    movu           m4, [lowq + posq]

    ; low[y + 1]
    add          posq, lwidthq
    add          posq, lwidthq
    movu           m1, [lowq + posq]

    mova           m5, m4
    punpcklwd      m4, m1
    punpckhwd      m5, m1

    mova           m6, m4
    mova           m7, m5

    ; m4/m5 = (low[y-1] - low[y+1] + 4) >> 3
    ; m6/m7 = (low[y+1] - low[y-1] + 4) >> 3
%if ARCH_X86_64
    pmaddwd        m4, m8
    pmaddwd        m5, m8
    pmaddwd        m6, m9
    pmaddwd        m7, m9

    paddd          m4, m11
    paddd          m5, m11
    paddd          m6, m11
    paddd          m7, m11
%else
    pmaddwd        m4, [factor_p1_n1]
    pmaddwd        m5, [factor_p1_n1]
    pmaddwd        m6, [factor_n1_p1]
    pmaddwd        m7, [factor_n1_p1]

    paddd          m4, [pd_4]
    paddd          m5, [pd_4]
    paddd          m6, [pd_4]
    paddd          m7, [pd_4]
%endif

    psrad          m4, 3
    psrad          m5, 3
    psrad          m6, 3
    psrad          m7, 3

    ; low[y]
    sub          posq, lwidthq
    movu           m0, [lowq + posq]

    ; high[y]
    mov          posq, hwidthq
    imul         posq, yq
    add          posq, xq
    movu           m1, [highq + posq]

    mova           m2, m0
    punpcklwd      m0, m1
    punpckhwd      m2, m1

    mova           m1, m0
    mova           m3, m2

    ; m0/m2 = low[y] + high[y], m1/m3 = low[y] - high[y]
%if ARCH_X86_64
    pmaddwd        m0, m10
    pmaddwd        m2, m10
    pmaddwd        m1, m8
    pmaddwd        m3, m8
%else
    pmaddwd        m0, [pw_1]
    pmaddwd        m2, [pw_1]
    pmaddwd        m1, [factor_p1_n1]
    pmaddwd        m3, [factor_p1_n1]
%endif

    paddd          m0, m4
    paddd          m2, m5
    paddd          m1, m6
    paddd          m3, m7

    psrad          m0, 1
    psrad          m2, 1
    psrad          m1, 1
    psrad          m3, 1

    packssdw       m0, m2
    packssdw       m1, m3

    ; pos = 2 * y * ostride + x  -> output rows 2y and 2y + 1
    mov          posq, ostrideq
    imul         posq, 2
    imul         posq, yq
    add          posq, xq

    movu [outputq + posq], m0
    add          posq, ostrideq
    movu [outputq + posq], m1

    add            yq, 1
    cmp            yq, heightq
    jl .looph

    ; ---- out[2y]: 5*low[y] + 4*low[y-1] via pmaddwd, - low[y-2], + high[y] ----
    mov          posq, lwidthq
    imul         posq, yq
    add          posq, xq
    movu           m0, [lowq + posq]
    sub          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m2, m0
    punpcklwd      m0, m1
    punpckhwd      m2, m1

%if ARCH_X86_64
    pmaddwd        m0, m13
    pmaddwd        m2, m13
%else
    pmaddwd        m0, [factor_p5_p4]
    pmaddwd        m2, [factor_p5_p4]
%endif

    pxor           m4, m4
    sub          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    psubd          m0, m4
    psubd          m2, m3

%if ARCH_X86_64
    paddd          m0, m11
    paddd          m2, m11
%else
    paddd          m0, [pd_4]
    paddd          m2, [pd_4]
%endif

    psrad          m0, 3
    psrad          m2, 3

    mov          posq, hwidthq
    imul         posq, yq
    add          posq, xq
    pxor           m4, m4
    movu           m1, [highq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    paddd          m0, m4
    paddd          m2, m3

    psrad          m0, 1
    psrad          m2, 1

    packssdw       m0, m2

    mov          posq, ostrideq
    imul         posq, 2
    imul         posq, yq
    add          posq, xq
    movu [outputq + posq], m0

    ; ---- out[2y+1]: 11*low[y] - 4*low[y-1] via pmaddwd, + low[y-2], - high[y] ----
    mov          posq, lwidthq
    imul         posq, yq
    add          posq, xq
    movu           m0, [lowq + posq]
    sub          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m2, m0
    punpcklwd      m0, m1
    punpckhwd      m2, m1

%if ARCH_X86_64
    pmaddwd        m0, m12
    pmaddwd        m2, m12
%else
    pmaddwd        m0, [factor_p11_n4]
    pmaddwd        m2, [factor_p11_n4]
%endif

    pxor           m4, m4
    sub          posq, lwidthq
    movu           m1, [lowq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    paddd          m0, m4
    paddd          m2, m3

%if ARCH_X86_64
    paddd          m0, m11
    paddd          m2, m11
%else
    paddd          m0, [pd_4]
    paddd          m2, [pd_4]
%endif

    psrad          m0, 3
    psrad          m2, 3

    mov          posq, hwidthq
    imul         posq, yq
    add          posq, xq
    pxor           m4, m4
    movu           m1, [highq + posq]
    mova           m3, m4
    punpcklwd      m4, m1
    punpckhwd      m3, m1

    psrad          m4, 16
    psrad          m3, 16

    psubd          m0, m4
    psubd          m2, m3

    psrad          m0, 1
    psrad          m2, 1

    packssdw       m0, m2

    mov          posq, ostrideq
    imul         posq, 2
    imul         posq, yq
    add          posq, ostrideq
    add          posq, xq
    movu [outputq + posq], m0

    ; Next column strip
    add            xq, mmsize
    cmp            xq, widthq
    jl .loopw
    RET