;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 4-tap horizontal subpel filter coefficients as words, stored as
; interleaved tap pairs (t0,t1 then t2,t3), each pair broadcast 4x so a
; single 64-bit load feeds pmaddwd in the mmxext H filters below.
fourtap_filter_hw_m: times 4 dw -6, 123
                     times 4 dw 12, -1
                     times 4 dw -9, 93
                     times 4 dw 50, -6
                     times 4 dw -6, 50
                     times 4 dw 93, -9
                     times 4 dw -1, 12
                     times 4 dw 123, -6

; 6-tap horizontal filter coefficients as words, same pairwise layout
; (three pairs per filter), consumed via pmaddwd in put_vp8_epel4_h6.
sixtap_filter_hw_m: times 4 dw 2, -11
                    times 4 dw 108, 36
                    times 4 dw -8, 1
                    times 4 dw 3, -16
                    times 4 dw 77, 77
                    times 4 dw -16, 3
                    times 4 dw 1, -8
                    times 4 dw 36, 108
                    times 4 dw -11, 2

; 4-tap filter coefficients as signed bytes, interleaved in pairs and
; broadcast 8x for pmaddubsw in the SSSE3 filters.
fourtap_filter_hb_m: times 8 db -6, 123
                     times 8 db 12, -1
                     times 8 db -9, 93
                     times 8 db 50, -6
                     times 8 db -6, 50
                     times 8 db 93, -9
                     times 8 db -1, 12
                     times 8 db 123, -6

; 6-tap filter coefficients as signed bytes for pmaddubsw; note the pair
; ordering matches the byte interleave produced by filter_h6_shuf1/2/3.
sixtap_filter_hb_m: times 8 db 2, 1
                    times 8 db -11, 108
                    times 8 db 36, -8
                    times 8 db 3, 3
                    times 8 db -16, 77
                    times 8 db 77, -16
                    times 8 db 1, 2
                    times 8 db -8, 36
                    times 8 db 108, -11

; 4-tap filter coefficients as words, one tap broadcast per 8-word row,
; used with pmullw by the SSE2 H filters and the FILTER_V macro.
fourtap_filter_v_m: times 8 dw -6
                    times 8 dw 123
                    times 8 dw 12
                    times 8 dw -1
                    times 8 dw -9
                    times 8 dw 93
                    times 8 dw 50
                    times 8 dw -6
                    times 8 dw -6
                    times 8 dw 50
                    times 8 dw 93
                    times 8 dw -9
                    times 8 dw -1
                    times 8 dw 12
                    times 8 dw 123
                    times 8 dw -6

; 6-tap filter coefficients as words, one broadcast tap per row (six rows
; per filter), used with pmullw by the SSE2 H filters and FILTER_V.
sixtap_filter_v_m: times 8 dw 2
                   times 8 dw -11
                   times 8 dw 108
                   times 8 dw 36
                   times 8 dw -8
                   times 8 dw 1
                   times 8 dw 3
                   times 8 dw -16
                   times 8 dw 77
                   times 8 dw 77
                   times 8 dw -16
                   times 8 dw 3
                   times 8 dw 1
                   times 8 dw -8
                   times 8 dw 36
                   times 8 dw 108
                   times 8 dw -11
                   times 8 dw 2

; bilinear word weights 1..7 (the complementary weight 8-x is fetched by
; negating the index; see FILTER_BILINEAR below).
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; bilinear byte weight pairs (8-x, x) for pmaddubsw (SSSE3 bilinear).
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

; Under PIC, the tables cannot be addressed absolutely: each function loads
; the table base into picregq (an extra register reserved via npicregs) and
; the *_filter_* names below resolve to that register.
%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

; pshufb masks building overlapping byte pairs (src[i], src[i+1]) so each
; pmaddubsw lane sees two adjacent pixels.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

; pshufb masks for the 6-tap SSSE3 filter: pairs at distance 5, 1 and 1,
; matching the tap interleave of sixtap_filter_hb_m.
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

; IDCT multiplier constants used by VP8_MULTIPLY_SUMSUB (see below).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pw_3
cextern pw_4
cextern pw_64
cextern pw_256

SECTION .text

;-------------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                                 uint8_t *src, int srcstride,
;                                                 int height, int mx, int my);
;-------------------------------------------------------------------------------

; SSSE3 subpel MC for block width %1 (4 = MMX regs, 8 = XMM regs).
; cglobal arguments follow x86inc convention: #args, #gprs, #xmm, then names.
%macro FILTER_SSSE3 1
; 6-tap horizontal filter: bytes are shuffled into tap pairs and accumulated
; with three pmaddubsw, then rounded with pmulhrsw by 1/256.
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]                 ; 3 table rows per filter phase
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]                ; round and scale back to pixels
    packuswb  m0, m0
    movh  [dstq], m0                      ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                          ; next row
    jg .nextrow
    REP_RET

; 4-tap horizontal filter, same structure with two pmaddubsw.
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4                       ; 16 bytes per table row
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0                      ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                          ; next row
    jg .nextrow
    REP_RET

; 4-tap vertical filter: keeps a 3-row window (m0..m2) that is rotated each
; iteration while byte-interleaved row pairs feed pmaddubsw.
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq]     ; read new row
    mova      m4, m0
    mova      m0, m1                      ; rotate the row window
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                          ; next row
    jg .nextrow
    REP_RET

; 6-tap vertical filter: 5-row window in m0..m4, rotated each iteration;
; filter rows are addressed directly through myq.
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq]     ; read new row
    mova      m6, m0
    punpcklbw m6, m5                      ; rows 0+5 share a tap pair
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                      ; rotate the row window
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
; MMXEXT fallback: word-precision taps via pmaddwd, pixels produced 2 at a
; time and merged with packssdw.
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]               ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                    ; byte ABCD..
    punpcklbw mm1, mm6                    ; byte->word ABCD
    pshufw    mm0, mm2, 9                 ; byte CDEF..
    punpcklbw mm0, mm6                    ; byte->word CDEF
    pshufw    mm3, mm1, 0x94              ; word ABBC
    pshufw    mm1, mm0, 0x94              ; word CDDE
    pmaddwd   mm3, mm4                    ; multiply 2px with F0/F1
    movq      mm0, mm1                    ; backup for second set of pixels
    pmaddwd   mm1, mm5                    ; multiply 2px with F2/F3
    paddd     mm3, mm1                    ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                    ; byte->word EFGH
    pmaddwd   mm0, mm4                    ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94              ; word EFFG
    pmaddwd   mm1, mm5                    ; multiply 2px with F2/F3
    paddd     mm0, mm1                    ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                    ; merge dword->word (4px)
    paddsw    mm3, mm7                    ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                    ; clip and word->bytes
    movd   [dstq], mm3                    ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                         ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]               ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                    ; byte ABCD..
    punpcklbw mm1, mm3                    ; byte->word ABCD
    pshufw    mm0, mm2, 0x9               ; byte CDEF..
    punpckhbw mm2, mm3                    ; byte->word EFGH
    punpcklbw mm0, mm3                    ; byte->word CDEF
    pshufw    mm1, mm1, 0x94              ; word ABBC
    pshufw    mm2, mm2, 0x94              ; word EFFG
    pmaddwd   mm1, mm4                    ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94              ; word CDDE
    movq      mm0, mm3                    ; backup for second set of pixels
    pmaddwd   mm3, mm5                    ; multiply 2px with F2/F3
    paddd     mm1, mm3                    ; add to 1st 2px cache
    movq      mm3, mm2                    ; backup for second set of pixels
    pmaddwd   mm2, mm6                    ; multiply 2px with F4/F5
    paddd     mm1, mm2                    ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]               ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                    ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                    ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                    ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                    ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9              ; word GHHI
    pmaddwd   mm2, mm6                    ; multiply 2px with F4/F5
    paddd     mm0, mm2                    ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                    ; merge dword->word (4px)
    paddsw    mm1, mm7                    ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                    ; clip and word->bytes
    movd   [dstq], mm1                    ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                         ; next row
    jg .nextrow
    REP_RET

; 8-wide H 4-tap filter: word multiplies per tap; on x86-64 (m8 defined)
; the two extra taps stay resident in m8/m9 instead of being re-read.
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5                       ; 32 bytes (2 rows) per filter phase
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4                      ; + 64: round before >> 7
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0                      ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                          ; next row
    jg .nextrow
    REP_RET

; 8-wide H 6-tap filter; on x86-64 all six taps live in m8-m13.
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4                       ; mx*48 = 6 16-byte rows per phase
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    ; sum negative-tap terms with their positive partners first to avoid
    ; losing positive overflows in the saturating adds
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6                      ; + 64: round before >> 7
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0                      ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                          ; next row
    jg .nextrow
    REP_RET

; Vertical subpel filters for width %1 (4 = MMX, 8 = XMM), pmullw-based.
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]                ; last tap kept in a register

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]     ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                      ; rotate the 3-row window
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                         ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]                 ; my*48 = 6 rows per filter phase
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]     ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1                      ; rotate the 5-row window
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                         ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_MMX sse2 needs XMM regs here, keep the original pairing:
INIT_XMM sse2
FILTER_V 8

; Bilinear MC for block width %1, word-weight (pmullw) variant.
; Weight x comes from bilinear_filter_vw[x-1]; weight 8-x is read via the
; negated index ([.. -x + 7*16] = row 7-x, holding 8-x ... NOTE(review):
; row 7-x holds value 7-x+1 = 8-x, matching the (8-x, x) weighting).
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16] ; weight my
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16] ; weight 8-my
.nextrow:
    ; two output rows per iteration (rows 0/1 from source rows 0/1/2)
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                      ; middle row is used by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                      ; >>1 with rounding: total /8 rounded
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16] ; weight mx
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16] ; weight 8-mx
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8

; SSSE3 bilinear: both weights packed as byte pairs for a single pmaddubsw.
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16] ; (8-my, my) byte pairs
.nextrow:
    ; two output rows per iteration
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1                      ; interleave vertical pixel pairs
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                      ; final rounded >>1
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]        ; build (x, x+1) horizontal pairs
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8

; Plain 8-wide copy, two rows per iteration.
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

; 16-wide copy via four MMX loads/stores; x86-32 only (SSE version below
; covers the rest).
%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

; 16-wide copy: unaligned loads (src may be unaligned), aligned stores
; (dst is assumed 16-byte aligned here — NOTE(review): implied by movaps,
; confirm against callers).
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; Add a splatted DC to a 4-row block at column offset %3 using %4 as the
; load/store instruction. %1 holds the positive DC bytes, %2 the negated DC
; bytes; paddusb/psubusb together implement a saturating signed add.
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

; DC-only IDCT + add: DC = (block[0] + 4) >> 3, splatted to 8 bytes,
; and block[0] is cleared for the next macroblock.
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd       m0, [blockq]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd [blockq], m1                     ; clear the coefficient
    psubw      m1, m0                     ; m1 = -DC for the psubusb path
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET

; SSE4 variant: loads the 4 dst rows as dwords, widens to words, adds the
; DC in word precision and stores back with pextrd.
INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd [blockq], m1                     ; clear the coefficient
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3
    pshuflw    m0, m0, 0                  ; splat DC across all word lanes
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                     ; clip back to bytes
    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET
;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------

; DC-only IDCT+add for 4 horizontally adjacent luma blocks: gather the four
; DCs, compute (dc+4)>>3 for each, splat each DC over its own 4-byte lane
; and apply via ADD_DC. MMX needs two 8-byte passes; x86-32 only.
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0]           ; A
    movd      m1, [blockq+32*2]           ; C
    punpcklwd m0, [blockq+32*1]           ; A B
    punpcklwd m1, [blockq+32*3]           ; C D
    punpckldq m0, m1                      ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6                ; clear the four DC coefficients
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0                      ; negated DCs for the psubusb path
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0                      ; AABBCCDD
    punpcklbw m6, m6                      ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0                      ; AAAABBBB
    punpckhbw m1, m1                      ; CCCCDDDD
    punpcklbw m6, m6                      ; AAAABBBB
    punpckhbw m7, m7                      ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif

; SSE2 version: all 16 bytes of splatted DCs fit in one register, one pass.
INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0]           ; A
    movd      m1, [blockq+32*2]           ; C
    punpcklwd m0, [blockq+32*1]           ; A B
    punpcklwd m1, [blockq+32*3]           ; C D
    punpckldq m0, m1                      ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1                ; clear the four DC coefficients
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0                      ; negated DCs
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0                      ; AABBCCDD
    punpcklbw m1, m1
    punpcklbw m0, m0                      ; AAAABBBBCCCCDDDD
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------

; Chroma variant: blocks A/B sit on the first four rows, C/D on the next
; four, so the second ADD_DC pass advances dst by 4 rows instead of
; moving 8 bytes right.
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0]           ; A
    movd      m1, [blockq+32*2]           ; C
    punpcklwd m0, [blockq+32*1]           ; A B
    punpcklwd m1, [blockq+32*3]           ; C D
    punpckldq m0, m1                      ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6                ; clear the four DC coefficients
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0                      ; negated DCs
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0                      ; AABBCCDD
    punpcklbw m6, m6                      ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0                      ; AAAABBBB
    punpckhbw m1, m1                      ; CCCCDDDD
    punpcklbw m6, m6                      ; AAAABBBB
    punpckhbw m7, m7                      ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]       ; C/D blocks are 4 rows down
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
%macro VP8_MULTIPLY_SUMSUB 4
    mova       %3, %1
    mova       %4, %2
    pmulhw     %3, m6                     ; 20091(1)
    pmulhw     %4, m6                     ; 20091(2)
    paddw      %3, %1
    paddw      %4, %2
    paddw      %1, %1
    paddw      %2, %2
    pmulhw     %1, m7                     ; 35468(1)
    pmulhw     %2, m7                     ; 35468(2)
    psubw      %1, %4
    paddw      %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA            w, %3, %1, %5    ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ; t2, t3
    SUMSUB_BA            w, %4, %3, %5    ; tmp0, tmp3
    SUMSUB_BA            w, %2, %1, %5    ; tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro

; Full 4x4 IDCT + add: two 1-D passes with a transpose between them,
; rounding folded into the row pass (+4, final >>3 done in STORE_DIFFx2).
; The coefficient block is cleared as a side effect.
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%if cpuflag(sse)
    ; with SSE available, clear all 32 bytes with two 16-byte stores
    ; (this is why the MMX build below is instantiated as "INIT_MMX sse")
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]               ; rounding bias for the 2nd pass
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse                              ; MMX regs + SSE block-clear path
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------

; Scatter two registers of 4 words each (m%1 and m%2) into the DC slot
; (offset 0, stride 2*16 bytes) of 8 of the 16 luma blocks, starting at
; block column %3.
%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

; One 4-point Hadamard pass over four word registers.
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

; Inverse WHT of the 4x4 luma DC plane: two Hadamard passes with a
; transpose between them, (x+3)>>3 rounding, then the 16 results are
; scattered into the DC positions of the 16 luma blocks. The dc[] input
; is cleared as a side effect.
%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
%if cpuflag(sse)
    ; SSE fast path for clearing dc[] (see note in VP8_IDCT_ADD)
    xorps       xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor          m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]              ; rounding bias for >>3
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse                              ; MMX regs + SSE dc[]-clear path
VP8_DC_WHT