;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


%include "aom_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.
;NOTE(review): in this file the products are accumulated in 32-bit lanes
;(pmaddwd/paddd), so the ordering note above may be inherited from a 16-bit
;variant of this code -- confirm before relying on it.

;-----------------------------------------------------------------------------
; Conventions shared by every routine in this file
;
;   arg(n), sym(), SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM, ALIGN_STACK and
;   UNSHADOW_ARGS come from x86_abi_support.asm (not visible here).
;
;   Each function %defines the stack slots the macros below refer to
;   (k0k6/k0k1/..., krd, temp, max, min) before invoking them.
;
;   Pixels are 16 bits wide: both pitches are doubled to turn them into byte
;   offsets, and all pixel arithmetic is done on words.
;
;   Every filter computes  clamp((sum(k[i] * px[i]) + 64) >> 7, 0, (1<<bps)-1).
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS_4
;   Prepares the constants used by HIGH_APPLY_FILTER_4.
;   Reads : arg(5) = pointer to eight 16-bit taps k0..k7. It is loaded with
;                    movdqa, so it must be 16-byte aligned.
;           arg(6) = bps, read as a DWORD and used as a word shift count.
;   Writes: k0k6, k2k5, k3k4, k1k7 = tap pairs interleaved as words
;           krd = 0x40 in every dword (rounding term)
;           max = (1 << bps) - 1 in every word, min = 0
;   Clobbers: rcx, rdx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x00000040             ;rounding term: 1 << (7 - 1)

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 into the low qword
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm6                  ;k0 k6 k0 k6 ...
    punpcklwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpcklwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
    punpcklwd   xmm1, xmm7                  ;k1 k7 k1 k7 ...

    movdqa      k0k6, xmm0
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3
    movdqa      k1k7, xmm1

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast 0x40 to all four dwords
    movdqa      krd, xmm6

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;two words, each holding 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)

%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
;   Filters four pixels and stores them (8 bytes) at [rdi].
;   In : the low four words of xmm0..xmm7 hold the pixels for taps 0..7
;        rdi = destination
;        %1  = non-zero to pavgw the result with the words already at [rdi]
;   Clobbers: xmm0-xmm3
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm6                  ;two row in one register
    punpcklwd   xmm1, xmm7
    punpcklwd   xmm2, xmm5
    punpcklwd   xmm3, xmm4

    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
    pmaddwd     xmm1, k1k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm3, k3k4

    paddd       xmm0, xmm1                  ;sum
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3

    paddd       xmm0, krd                   ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movq        [rdi], xmm0
%endm

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS
;   Prepares the constants used by HIGH_APPLY_FILTER_8 and loads the pointers.
;   Reads : arg(0) -> rsi (src_ptr), arg(2) -> rdi (output_ptr)
;           arg(5) = pointer to eight 16-bit taps k0..k7. It is loaded with
;                    movdqa, so it must be 16-byte aligned.
;           arg(6) = bps, read as a DWORD and used as a word shift count.
;   Writes: k0k1, k6k7, k2k5, k3k4 = tap pairs interleaved as words
;           krd = 0x40 in every dword (rounding term)
;           max = (1 << bps) - 1 in every word, min = 0
;   Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding term: 1 << (7 - 1)

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7
    punpcklqdq  xmm2, xmm2                  ;copy k2 into the high qword
    punpcklqdq  xmm3, xmm3                  ;copy k3 into the high qword
    punpcklwd   xmm0, xmm1                  ;k0 k1 k0 k1 ...
    punpckhwd   xmm6, xmm7                  ;k6 k7 k6 k7 ...
    punpckhwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpckhwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...

    movdqa      k0k1, xmm0                  ;store filter factors on stack
    movdqa      k6k7, xmm6
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast 0x40 to all four dwords
    movdqa      krd, xmm6                   ;rounding

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;two words, each holding 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_8 byte_offset
;   Loads 16 bytes from each of eight consecutive rows into xmm0..xmm7
;   (row n -> xmm<n>), starting %1 bytes into the row.
;   In : rsi = address of row 0, rax = row pitch in bytes, rdx = 3 * rax
;   Side effect: rsi is advanced by one row (rsi += rax).
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ;0
    movdqu      xmm1, [rsi + rax + %1]      ;1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
    movdqu      xmm2, [rsi + rax + %1]      ;2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
    movdqu      xmm4, [rsi + rdx + %1]      ;4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, byte_offset
;   Filters eight pixels and stores them (16 bytes) at [rdi + %2].
;   In : xmm0..xmm7 hold eight pixels each for taps 0..7
;        rdi = destination
;        %1  = non-zero to pavgw the result with the words already at
;              [rdi + %2]
;        %2  = byte offset into the destination row
;   Uses the `temp` stack slot as a spill because all eight xmm registers
;   are live.
;   Clobbers: xmm0-xmm7, temp
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    movdqu      temp, xmm4                  ;spill tap 4
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;taps 0/1, low four pixels
    punpckhwd   xmm4, xmm1                  ;taps 0/1, high four pixels
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7                  ;taps 6/7, low
    punpckhwd   xmm1, xmm7                  ;taps 6/7, high
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5                  ;taps 2/5, low
    punpckhwd   xmm7, xmm5                  ;taps 2/5, high

    movdqu      xmm5, temp                  ;reload tap 4
    movdqu      temp, xmm4                  ;spill taps 0/1 high
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ;taps 3/4, low
    punpckhwd   xmm4, xmm5                  ;taps 3/4, high
    movdqu      xmm5, temp                  ;reload taps 0/1 high

    pmaddwd     xmm0, k0k1
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm4, k3k4

    paddd       xmm0, xmm6                  ;low four pixels
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3
    paddd       xmm5, xmm1                  ;high four pixels
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4

    paddd       xmm0, krd                   ;rounding
    paddd       xmm5, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm5, 7
    packssdw    xmm0, xmm5                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi + %2], xmm0
%endm

SECTION .text

;-----------------------------------------------------------------------------
;void aom_highbd_filter_block1d4_v8_sse2
;(
;    src_ptr,          arg(0): 16-bit pixels; row for tap 0 of the first output
;    src_pitch,        arg(1): source pitch in pixels (read as a DWORD)
;    output_ptr,       arg(2): 16-bit pixels
;    out_pitch,        arg(3): output pitch in pixels (read as a DWORD)
;    output_height,    arg(4): number of rows to produce (read as a DWORD)
;    filter,           arg(5): eight 16-bit taps, 16-byte aligned
;    bps               arg(6): bit depth used to build the clamp maximum
;)
; Vertical 8-tap filter, 4 pixels wide. Each output row reads 8 bytes from
; each of eight consecutive source rows. A zero output_height returns
; without touching memory.
; NOTE(review): exact C parameter types are not visible here -- confirm
; against the C prototype.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx                    ;the loop is bottom-tested, so a
    jz          .done                       ;zero count must be skipped here

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one source row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void aom_highbd_filter_block1d8_v8_sse2
;(
;    src_ptr,          arg(0): 16-bit pixels; row for tap 0 of the first output
;    src_pitch,        arg(1): source pitch in pixels (read as a DWORD)
;    output_ptr,       arg(2): 16-bit pixels
;    out_pitch,        arg(3): output pitch in pixels (read as a DWORD)
;    output_height,    arg(4): number of rows to produce (read as a DWORD)
;    filter,           arg(5): eight 16-bit taps, 16-byte aligned
;    bps               arg(6): bit depth used to build the clamp maximum
;)
; Vertical 8-tap filter, 8 pixels wide. Each output row reads 16 bytes from
; each of eight consecutive source rows. A zero output_height returns
; without touching memory.
; NOTE(review): exact C parameter types are not visible here -- confirm
; against the C prototype.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx                    ;the loop is bottom-tested, so a
    jz          .done                       ;zero count must be skipped here

.loop:
    LOAD_VERT_8 0                           ;also advances rsi by one row
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void aom_highbd_filter_block1d16_v8_sse2
;(
;    src_ptr,          arg(0): 16-bit pixels; row for tap 0 of the first output
;    src_pitch,        arg(1): source pitch in pixels (read as a DWORD)
;    output_ptr,       arg(2): 16-bit pixels
;    out_pitch,        arg(3): output pitch in pixels (read as a DWORD)
;    output_height,    arg(4): number of rows to produce (read as a DWORD)
;    filter,           arg(5): eight 16-bit taps, 16-byte aligned
;    bps               arg(6): bit depth used to build the clamp maximum
;)
; Vertical 8-tap filter, 16 pixels wide, done as two 8-pixel halves per row
; (byte offsets 0 and 16). A zero output_height returns without touching
; memory.
; NOTE(review): exact C parameter types are not visible here -- confirm
; against the C prototype.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx                    ;the loop is bottom-tested, so a
    jz          .done                       ;zero count must be skipped here

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the advance for the right half

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void aom_highbd_filter_block1d4_h8_sse2
;(
;    src_ptr,             arg(0): 16-bit pixels; taps 0..7 read pixels -3..+4
;    src_pixels_per_line, arg(1): source pitch in pixels (read as a DWORD)
;    output_ptr,          arg(2): 16-bit pixels
;    output_pitch,        arg(3): output pitch in pixels (read as a DWORD)
;    output_height,       arg(4): number of rows to produce (read as a DWORD)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth used to build the clamp maximum
;)
; Horizontal 8-tap filter, 4 pixels wide. Each row issues two 16-byte loads,
; at src - 6 and src + 2 bytes. A zero output_height returns without touching
; memory.
; NOTE(review): exact C parameter types are not visible here -- confirm
; against the C prototype.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx                    ;the loop is bottom-tested, so a
    jz          .done                       ;zero count must be skipped here

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm4, [rsi + 2]
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    psrldq      xmm1, 2                     ;tap 1: pixels from -2
    psrldq      xmm6, 4                     ;tap 6: pixels from +3
    psrldq      xmm7, 6                     ;tap 7: pixels from +4
    psrldq      xmm2, 4                     ;tap 2: pixels from -1
    psrldq      xmm3, 6                     ;tap 3: pixels from  0
    psrldq      xmm5, 2                     ;tap 5: pixels from +2

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 7
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void aom_highbd_filter_block1d8_h8_sse2
;(
;    src_ptr,             arg(0): 16-bit pixels; taps 0..7 read pixels -3..+4
;    src_pixels_per_line, arg(1): source pitch in pixels (read as a DWORD)
;    output_ptr,          arg(2): 16-bit pixels
;    output_pitch,        arg(3): output pitch in pixels (read as a DWORD)
;    output_height,       arg(4): number of rows to produce (read as a DWORD)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth used to build the clamp maximum
;)
; Horizontal 8-tap filter, 8 pixels wide. Each row issues eight 16-byte
; loads, from src - 6 to src + 8 bytes. A zero output_height returns without
; touching memory.
; NOTE(review): exact C parameter types are not visible here -- confirm
; against the C prototype.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx                    ;the loop is bottom-tested, so a
    jz          .done                       ;zero count must be skipped here

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void aom_highbd_filter_block1d16_h8_sse2
;(
;    src_ptr,             arg(0): 16-bit pixels; taps 0..7 read pixels -3..+4
;    src_pixels_per_line, arg(1): source pitch in pixels (read as a DWORD)
;    output_ptr,          arg(2): 16-bit pixels
;    output_pitch,        arg(3): output pitch in pixels (read as a DWORD)
;    output_height,       arg(4): number of rows to produce (read as a DWORD)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned
;    bps                  arg(6): bit depth used to build the clamp maximum
;)
; Horizontal 8-tap filter, 16 pixels wide, done as two 8-pixel halves per
; row. The second half repeats the first with every source offset moved by
; 16 bytes. A zero output_height returns without touching memory.
; NOTE(review): exact C parameter types are not visible here -- confirm
; against the C prototype.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi/rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    test        rcx, rcx                    ;the loop is bottom-tested, so a
    jz          .done                       ;zero count must be skipped here

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    movdqu      xmm0, [rsi + 10]            ;load src
    movdqu      xmm1, [rsi + 12]
    movdqu      xmm2, [rsi + 14]
    movdqu      xmm3, [rsi + 16]
    movdqu      xmm4, [rsi + 18]
    movdqu      xmm5, [rsi + 20]
    movdqu      xmm6, [rsi + 22]
    movdqu      xmm7, [rsi + 24]

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 8
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret