;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; 8-tap one-dimensional filter kernels (SSSE3), Intel/NASM syntax.
;
; Every exported routine has the C signature
;
;     void fn(unsigned char *src_ptr,      ; arg(0)
;             unsigned int   src_pitch,    ; arg(1)
;             unsigned char *output_ptr,   ; arg(2)
;             unsigned int   out_pitch,    ; arg(3)
;             unsigned int   output_height,; arg(4)
;             short         *filter)       ; arg(5)
;
; and reads its arguments through the arg(n) accessor supplied by
; x86_abi_support.asm.
;
; Per output pixel:
;   - the eight 16-bit taps at [filter] are narrowed to signed bytes
;     (packsswb) and applied pairwise to unsigned source bytes (pmaddubsw);
;   - the four pair sums are accumulated with saturating adds (paddsw); the
;     two centre pair sums are added smaller-first (pminsw, then pmaxsw),
;     presumably to keep the intermediate sum from saturating early;
;   - 64 is added, the word is shifted right arithmetically by 7 and packed
;     to an unsigned byte with saturation (packuswb);
;   - the *_avg_* variants then pavgb the result with the byte that is
;     already in the output buffer.
;
; Requirements visible in the code:
;   - [filter] is loaded with movdqa, so it must be 16-byte aligned;
;   - vertical kernels read rows src_ptr + (0..7) * src_pitch;
;   - horizontal kernels read source bytes starting at src_ptr - 3
;     (16 bytes per row for the 4/8-wide kernels, 24 for the 16-wide one);
;   - the vertical loops and the 16-wide horizontal loop are do/while shaped
;     (dec/jnz), so output_height must be at least 1 for those kernels.
;-----------------------------------------------------------------------------

; Two packed words of 64.  Broadcast with pshufd it becomes the rounding term
; added before the ">> 7" normalisation.
%define ROUNDING_WORDS  0x00400040

; Number of 16-byte stack slots reserved for filter constants.
%define PAIR_SLOTS      5       ; k0k1, k2k3, k4k5, k6k7, krd
%define QUAD_SLOTS      3       ; k0k1k4k5, k2k3k6k7, krd_quad

; Stack-resident constants.  They are valid once the matching prologue has
; aligned rsp and reserved the slots.
; "Pair" layout: each slot holds one coefficient pair replicated 8 times.
%define k0k1        [rsp + 16*0]
%define k2k3        [rsp + 16*1]
%define k4k5        [rsp + 16*2]
%define k6k7        [rsp + 16*3]
%define krd         [rsp + 16*4]
; "Quad" layout (4-wide horizontal kernel): two pairs share one slot, the
; first in the low four words and the second in the high four words.
%define k0k1k4k5    [rsp + 16*0]
%define k2k3k6k7    [rsp + 16*1]
%define krd_quad    [rsp + 16*2]

;-----------------------------------------------------------------------------
; LOAD_FILTER_PAIRS
; Loads the pointer arguments and fills the "pair" layout stack slots.
; Out:   rsi = src_ptr, rdi = output_ptr, k0k1..k6k7 and krd initialised
; Clobb: rcx, rdx, xmm0-xmm5
;-----------------------------------------------------------------------------
%macro LOAD_FILTER_PAIRS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUNDING_WORDS

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate low qword to high
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;rounding in all eight words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5
%endm

;-----------------------------------------------------------------------------
; VERT_COLUMNS op, offset, avg
; Filters one group of columns of the current output row and stores it.
;   %1 = load/store instruction (movd = 4 columns, movq = 8 columns)
;   %2 = byte offset of the column group inside the row
;   %3 = non-zero to average with the bytes already at the destination
; In:    rsi = source row 0, rax = rsi + pitch, rdx = pitch,
;        rbx = pitch * 6, rdi = output row
; Clobb: xmm0-xmm7
;-----------------------------------------------------------------------------
%macro VERT_COLUMNS 3
    %1          xmm0, [rsi + %2]            ;A
    %1          xmm1, [rsi + rdx + %2]      ;B
    %1          xmm2, [rsi + rdx * 2 + %2]  ;C
    %1          xmm3, [rax + rdx * 2 + %2]  ;D
    %1          xmm4, [rsi + rdx * 4 + %2]  ;E
    %1          xmm5, [rax + rdx * 4 + %2]  ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    %1          xmm6, [rsi + rbx + %2]      ;G
    %1          xmm7, [rax + rbx + %2]      ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;outer pairs first
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;larger centre pair sum
    pminsw      xmm4, xmm1                  ;smaller centre pair sum
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+ 64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %3
    %1          xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    %1          [rdi + %2], xmm0
%endm

;-----------------------------------------------------------------------------
; VERT_LOOP op, second_group, avg
; Complete vertical kernel: filter setup plus the per-row loop.
;   %1 = load/store instruction handed to VERT_COLUMNS
;   %2 = non-zero to also filter the column group at byte offset 8
;   %3 = non-zero for the averaging variant
; Uses rbx as pitch * 6 (saved/restored by VERT_PROLOGUE/VERT_EPILOGUE) and,
; on 64-bit ABIs, r8 as the output pitch.
;-----------------------------------------------------------------------------
%macro VERT_LOOP 3
    LOAD_FILTER_PAIRS

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    VERT_COLUMNS %1, 0, %3
%if %2
    VERT_COLUMNS %1, 8, %3
%endif

    add         rsi, rdx                    ;next source row
    add         rax, rdx

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

; Width-specific vertical kernels.  %1 = 0 for store, 1 for average-and-store.
%macro VERTx4 1
    VERT_LOOP   movd, 0, %1
%endm

%macro VERTx8 1
    VERT_LOOP   movq, 0, %1
%endm

%macro VERTx16 1
    VERT_LOOP   movq, 1, %1
%endm

;-----------------------------------------------------------------------------
; VERT_PROLOGUE / VERT_EPILOGUE
; Shared frame for the vertical entry points: saves rsi, rdi, rbx, aligns the
; stack to 16 bytes (ALIGN_STACK is undone by "pop rsp" in the epilogue) and
; reserves the PAIR_SLOTS constant slots.
;-----------------------------------------------------------------------------
%macro VERT_PROLOGUE 0
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * PAIR_SLOTS
%endm

%macro VERT_EPILOGUE 0
    add         rsp, 16 * PAIR_SLOTS
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    VERT_PROLOGUE
    VERTx4 0
    VERT_EPILOGUE

;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    VERT_PROLOGUE
    VERTx8 0
    VERT_EPILOGUE

;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    VERT_PROLOGUE
    VERTx16 0
    VERT_EPILOGUE

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Averaging vertical variants: same arguments as above; the filtered bytes
; are combined with the existing output bytes using pavgb.

global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    VERT_PROLOGUE
    VERTx4 1
    VERT_EPILOGUE

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    VERT_PROLOGUE
    VERTx8 1
    VERT_EPILOGUE

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    VERT_PROLOGUE
    VERTx16 1
    VERT_EPILOGUE

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;-----------------------------------------------------------------------------
; HORIZx4_ROW row, tmp
; Filters four pixels of one row.
;   %1 = in:  16 source bytes starting at src - 3
;        out: four result bytes in the low dword
;   %2 = scratch register
; Clobb: xmm4, xmm5, xmm6 in addition to %1 and %2
;-----------------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %1,   k0k1k4k5              ;low: taps 0,1   high: taps 4,5
    pmaddubsw   %2,   k2k3k6k7              ;low: taps 2,3   high: taps 6,7

    movdqa      xmm4, %1
    movdqa      xmm5, %2
    psrldq      %1,   8                     ;bring taps 4,5 down
    psrldq      %2,   8                     ;bring taps 6,7 down
    movdqa      xmm6, xmm5

    paddsw      xmm4, %2                    ;outer pairs first
    pmaxsw      xmm5, %1                    ;larger centre pair sum
    pminsw      %1, xmm6                    ;smaller centre pair sum
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1, krd_quad                ;+ 64
    psraw       %1, 7
    packuswb    %1, %1
%endm

;-----------------------------------------------------------------------------
; HORIZx4 avg
; 4-wide horizontal kernel.  Rows are processed two at a time; a trailing
; single row is handled after the loop when output_height is odd.
;   %1 = non-zero for the averaging variant
; Clobb: rax, rcx, rdx, rsi, rdi, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUNDING_WORDS

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd_quad, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;height < 2: the do/while loop
                                            ;below must not be entered
.loop:
    ;Do two rows at once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi + rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done:
%endm

;-----------------------------------------------------------------------------
; HORIZx8_ROW row, tmp1, tmp2, tmp3
; Filters eight pixels of one row.
;   %1 = in:  16 source bytes starting at src - 3
;        out: eight result bytes in the low qword
;   %2-%4 = scratch registers
;-----------------------------------------------------------------------------
%macro HORIZx8_ROW 4
    movdqa      %2,   %1
    movdqa      %3,   %1
    movdqa      %4,   %1

    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pshufb      %4,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   %1,   k0k1
    pmaddubsw   %2,   k2k3
    pmaddubsw   %3,   k4k5
    pmaddubsw   %4,   k6k7

    paddsw      %1,   %4                    ;outer pairs first
    movdqa      %4,   %2
    pmaxsw      %2,   %3                    ;larger centre pair sum
    pminsw      %3,   %4                    ;smaller centre pair sum
    paddsw      %1,   %3
    paddsw      %1,   %2

    paddsw      %1,   krd                   ;+ 64
    psraw       %1,   7
    packuswb    %1,   %1
%endm

;-----------------------------------------------------------------------------
; HORIZx8 avg
; 8-wide horizontal kernel.  Rows are processed two at a time; a trailing
; single row is handled after the loop when output_height is odd.
;   %1 = non-zero for the averaging variant
; Clobb: rax, rcx, rdx, rsi, rdi, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HORIZx8 1
    LOAD_FILTER_PAIRS

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;height < 2: the do/while loop
                                            ;below must not be entered
.loop:
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm3, [rsi + 5]
    movq        xmm4, [rsi + rax - 3]
    movq        xmm7, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm3
    punpcklqdq  xmm4, xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1, [rdi]
    movq        xmm2, [rdi + rdx]
    pavgb       xmm0, xmm1
    pavgb       xmm4, xmm2
%endif
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm4

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]
    movq        xmm3, [rsi + 5]
    punpcklqdq  xmm0, xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0
.done:
%endm

;-----------------------------------------------------------------------------
; HORIZx16 avg
; 16-wide horizontal kernel, one row per iteration.  Each source byte is
; duplicated (punpck[lh]bw reg, reg) and palignr by 1/5/9/13 then yields the
; (n, n+1) byte pairs for taps 0-1 / 2-3 / 4-5 / 6-7.
;   %1 = non-zero for the averaging variant
; Clobb: rax, rcx, rdx, rsi, rdi, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HORIZx16 1
    LOAD_FILTER_PAIRS

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax - 3]

    movq        xmm0, [rsi - 3]             ;load src data
    movq        xmm4, [rsi + 5]
    movq        xmm6, [rsi + 13]
    punpcklqdq  xmm0, xmm4                  ;bytes for output pixels 0-7
    punpcklqdq  xmm4, xmm6                  ;bytes for output pixels 8-15

    movdqa      xmm7, xmm0

    punpcklbw   xmm7, xmm7
    punpckhbw   xmm0, xmm0
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0

    palignr     xmm0, xmm7, 1
    palignr     xmm1, xmm7, 5
    pmaddubsw   xmm0, k0k1
    palignr     xmm2, xmm7, 9
    pmaddubsw   xmm1, k2k3
    palignr     xmm3, xmm7, 13

    pmaddubsw   xmm2, k4k5
    pmaddubsw   xmm3, k6k7
    paddsw      xmm0, xmm3

    movdqa      xmm3, xmm4
    punpcklbw   xmm3, xmm3
    punpckhbw   xmm4, xmm4

    movdqa      xmm5, xmm4
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4

    palignr     xmm4, xmm3, 1
    palignr     xmm5, xmm3, 5
    palignr     xmm6, xmm3, 9
    palignr     xmm7, xmm3, 13

    movdqa      xmm3, xmm1
    pmaddubsw   xmm4, k0k1
    pmaxsw      xmm1, xmm2
    pmaddubsw   xmm5, k2k3
    pminsw      xmm2, xmm3
    pmaddubsw   xmm6, k4k5
    paddsw      xmm0, xmm2
    pmaddubsw   xmm7, k6k7
    paddsw      xmm0, xmm1

    paddsw      xmm4, xmm7
    movdqa      xmm7, xmm5
    pmaxsw      xmm5, xmm6
    pminsw      xmm6, xmm7
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm5

    paddsw      xmm0, krd                   ;+ 64
    paddsw      xmm4, krd
    psraw       xmm0, 7
    psraw       xmm4, 7
    packuswb    xmm0, xmm0
    packuswb    xmm4, xmm4
    punpcklqdq  xmm0, xmm4                  ;16 result bytes
    ; movdqu rather than movdqa: nothing here guarantees that every output
    ; row starts on a 16-byte boundary, and movdqa faults when it does not.
%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    lea         rsi, [rsi + rax]
    movdqu      [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm

;-----------------------------------------------------------------------------
; HORIZ_PROLOGUE slots / HORIZ_EPILOGUE slots
; Shared frame for the horizontal entry points.  GET_GOT rbx precedes the
; GLOBAL() table references made by the row macros; rsi and rdi are saved,
; the stack is aligned to 16 bytes (undone by "pop rsp") and %1 16-byte
; constant slots are reserved.
;-----------------------------------------------------------------------------
%macro HORIZ_PROLOGUE 1
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * %1
%endm

%macro HORIZ_EPILOGUE 1
    add         rsp, 16 * %1
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    HORIZ_PROLOGUE QUAD_SLOTS
    HORIZx4 0
    HORIZ_EPILOGUE QUAD_SLOTS

;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    HORIZ_PROLOGUE PAIR_SLOTS
    HORIZx8 0
    HORIZ_EPILOGUE PAIR_SLOTS

;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    HORIZ_PROLOGUE PAIR_SLOTS
    HORIZx16 0
    HORIZ_EPILOGUE PAIR_SLOTS

; Averaging horizontal variants: same arguments as above; the filtered bytes
; are combined with the existing output bytes using pavgb.

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    HORIZ_PROLOGUE QUAD_SLOTS
    HORIZx4 1
    HORIZ_EPILOGUE QUAD_SLOTS

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    HORIZ_PROLOGUE PAIR_SLOTS
    HORIZx8 1
    HORIZ_EPILOGUE PAIR_SLOTS

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    HORIZ_PROLOGUE PAIR_SLOTS
    HORIZx16 1
    HORIZ_EPILOGUE PAIR_SLOTS

SECTION_RODATA
; pshufb masks used by HORIZx4_ROW / HORIZx8_ROW.  Each mask gathers eight
; overlapping (n, n+1) source byte pairs, one pair per output pixel, for the
; tap pair named in the label.
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14