;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; VP8 4-tap subpel filter coefficients as words, one 8-word row per
; (tap pair, mx) combination; indexed via mx scaled by the caller.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; VP8 6-tap subpel filter coefficients as words (3 tap pairs per mx).
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

; Same 4-tap coefficients packed as signed bytes for pmaddubsw.
fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

; 6-tap coefficients packed as bytes; tap pairing differs from the word
; table so that each pmaddubsw sees an (unsigned pixel, signed coef) pair
; whose product cannot overflow.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap coefficients with one tap per 8-word row (for pmullw-based filters).
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap coefficients with one tap per 8-word row.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; Bilinear filter weights as words (weight = mx or my, 1..7).
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weight pairs (8-w, w) as bytes for pmaddubsw.
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
; In PIC builds the tables are reached through a register (picregq) that
; each function loads with the table base; one extra register is needed.
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

; pshufb masks producing the interleaved pixel pairs each pmaddubsw needs.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pw_3
cextern pw_4
cextern pw_64
cextern pw_256

SECTION .text

;-------------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
;                                                 uint8_t *src, ptrdiff_t srcstride,
;                                                 int height, int mx, int my);
;-------------------------------------------------------------------------------

; SSSE3 h4/h6/v4/v6 epel filters.
; %1: block width (4 -> MMX regs, 8 -> XMM regs; set by INIT_MMX/INIT_XMM)
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]              ; 6-tap tables have 3 rows of 16 bytes per mx
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]             ; (x*256 + 0x8000) >> 16 = rounded >> 7
    packuswb  m0, m0
    movh  [dstq], m0                   ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4                    ; 4-tap tables have 2 rows of 16 bytes per mx
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0                   ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    ; m0..m2 hold the previous three rows; fetch one new row per iteration
    movh      m3, [srcq+2*srcstrideq]  ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    ; m0..m4 hold the previous five rows; fetch one new row per iteration
    movh      m5, [srcq+2*srcstrideq]  ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]            ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                 ; byte ABCD..
    punpcklbw mm1, mm6                 ; byte->word ABCD
    pshufw    mm0, mm2, 9              ; byte CDEF..
    punpcklbw mm0, mm6                 ; byte->word CDEF
    pshufw    mm3, mm1, 0x94           ; word ABBC
    pshufw    mm1, mm0, 0x94           ; word CDDE
    pmaddwd   mm3, mm4                 ; multiply 2px with F0/F1
    movq      mm0, mm1                 ; backup for second set of pixels
    pmaddwd   mm1, mm5                 ; multiply 2px with F2/F3
    paddd     mm3, mm1                 ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                 ; byte->word EFGH
    pmaddwd   mm0, mm4                 ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94           ; word EFFG
    pmaddwd   mm1, mm5                 ; multiply 2px with F2/F3
    paddd     mm0, mm1                 ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                 ; merge dword->word (4px)
    paddsw    mm3, mm7                 ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                 ; clip and word->bytes
    movd   [dstq], mm3                 ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                      ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]            ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                 ; byte ABCD..
    punpcklbw mm1, mm3                 ; byte->word ABCD
    pshufw    mm0, mm2, 0x9            ; byte CDEF..
    punpckhbw mm2, mm3                 ; byte->word EFGH
    punpcklbw mm0, mm3                 ; byte->word CDEF
    pshufw    mm1, mm1, 0x94           ; word ABBC
    pshufw    mm2, mm2, 0x94           ; word EFFG
    pmaddwd   mm1, mm4                 ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94           ; word CDDE
    movq      mm0, mm3                 ; backup for second set of pixels
    pmaddwd   mm3, mm5                 ; multiply 2px with F2/F3
    paddd     mm1, mm3                 ; add to 1st 2px cache
    movq      mm3, mm2                 ; backup for second set of pixels
    pmaddwd   mm2, mm6                 ; multiply 2px with F4/F5
    paddd     mm1, mm2                 ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]            ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                 ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                 ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                 ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                 ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9           ; word GHHI
    pmaddwd   mm2, mm6                 ; multiply 2px with F4/F5
    paddd     mm0, mm2                 ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                 ; merge dword->word (4px)
    paddsw    mm1, mm7                 ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                 ; clip and word->bytes
    movd   [dstq], mm1                 ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                      ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    ; x86-64: keep the remaining two taps in registers too
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0                   ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    ; x86-64: all six taps fit in registers
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    ; sum negative-tap terms with positive-tap terms in an order that
    ; avoids premature saturation
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0                   ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET

; pmullw-based vertical epel filters (MMXEXT/SSE2).
; %1: block width (4 or 8)
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]  ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]  ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                       ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8

; Bilinear H/V filters; two rows are processed per loop iteration.
; %1: block width (4 or 8)
%macro FILTER_BILINEAR 1
%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    ; >>2 then pavgw with 0 gives the rounded >>3 of the weighted sum
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16] ; weight my
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16] ; weight 8-my
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16] ; weight mx
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16] ; weight 8-mx
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
INIT_MMX ssse3
FILTER_BILINEAR 4
INIT_XMM ssse3
FILTER_BILINEAR 8

; Plain copy, 8 pixels wide, two rows per iteration.
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq   [dstq+dststrideq*0], mm0
    movq   [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
; Plain copy, 16 pixels wide (two MMX regs per row); x86-32 fallback only.
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq   [dstq+dststrideq*0+0], mm0
    movq   [dstq+dststrideq*0+8], mm1
    movq   [dstq+dststrideq*1+0], mm2
    movq   [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

; Plain copy, 16 pixels wide; src may be unaligned, dst is assumed aligned.
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; Add a splatted DC to a 4-row block of pixels with unsigned saturation.
; %1: register with the positive DC replicated as bytes
; %2: register with the negated DC replicated as bytes
; %3: byte offset into each row
; %4: load/store instruction to use (movh/mova)
; Adding %1 then subtracting %2 (both unsigned-saturating) implements a
; signed DC add on unsigned pixels: exactly one of %1/%2 is nonzero.
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd       m0, [blockq]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd [blockq], m1                  ; clear the DC coefficient in-place
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET
%endif

%macro VP8_IDCT_DC_ADD 0
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd [blockq], m1                  ; clear the DC coefficient in-place
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd  [dst1q], m2
%if cpuflag(sse4)
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
%else
    psrldq     m2, 4
    movd [dst1q+strideq], m2
    psrldq     m2, 4
    movd  [dst2q], m2
    psrldq     m2, 4
    movd [dst2q+strideq], m2
%endif
    RET
%endmacro

INIT_XMM sse2
VP8_IDCT_DC_ADD
INIT_XMM sse4
VP8_IDCT_DC_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0]        ; A
    movd      m1, [blockq+32*2]        ; C
    punpcklwd m0, [blockq+32*1]        ; A B
    punpcklwd m1, [blockq+32*3]        ; C D
    punpckldq m0, m1                   ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0                   ; AABBCCDD
    punpcklbw m6, m6                   ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0                   ; AAAABBBB
    punpckhbw m1, m1                   ; CCCCDDDD
    punpcklbw m6, m6                   ; AAAABBBB
    punpckhbw m7, m7                   ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0]        ; A
    movd      m1, [blockq+32*2]        ; C
    punpcklwd m0, [blockq+32*1]        ; A B
    punpcklwd m1, [blockq+32*3]        ; C D
    punpckldq m0, m1                   ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0]        ; A
    movd      m1, [blockq+32*2]        ; C
    punpcklwd m0, [blockq+32*1]        ; A B
    punpcklwd m1, [blockq+32*3]        ; C D
    punpckldq m0, m1                   ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0                   ; AABBCCDD
    punpcklbw m6, m6                   ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0                   ; AAAABBBB
    punpckhbw m1, m1                   ; CCCCDDDD
    punpcklbw m6, m6                   ; AAAABBBB
    punpckhbw m7, m7                   ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ; chroma blocks C/D sit 4 rows below A/B rather than 8 bytes right
    lea    dst1q, [dst1q+strideq*4]
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
; (pmulhw by 17734 after doubling the input is equivalent to mul_35468)
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6                   ;20091(1)
    pmulhw    %4, m6                   ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7                   ;35468(1)
    pmulhw    %2, m7                   ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA            w, %3, %1, %5 ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ;t2, t3
    SUMSUB_BA            w, %4, %3, %5 ;tmp0, tmp3
    SUMSUB_BA            w, %2, %1, %5 ;tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%if cpuflag(sse)
    ; clear the whole 32-byte block with two 16-byte stores
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]            ; rounding for the >>3 in STORE_DIFFx2
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------

; Scatter two registers of 4 DC words each into the DC slot (coefficient 0,
; i.e. stride 2*16 bytes) of 8 of the 16 luma blocks.
; %1/%2: register numbers holding the DC words
; %3:    block-column offset (0 or 2)
%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*( 0+%3)], dc1w
    mov [blockq+2*16*( 1+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*( 4+%3)], dc1w
    mov [blockq+2*16*( 5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*( 8+%3)], dc1w
    mov [blockq+2*16*( 9+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

; One 4-point Walsh-Hadamard transform pass over registers %1-%4.
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
%if cpuflag(sse)
    ; clear the dc block with two 16-byte stores
    xorps       xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor          m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]           ; rounding for the >>3 below
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT