1;****************************************************************************** 2;* VP8 MMXEXT optimizations 3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> 4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> 5;* 6;* This file is part of FFmpeg. 7;* 8;* FFmpeg is free software; you can redistribute it and/or 9;* modify it under the terms of the GNU Lesser General Public 10;* License as published by the Free Software Foundation; either 11;* version 2.1 of the License, or (at your option) any later version. 12;* 13;* FFmpeg is distributed in the hope that it will be useful, 14;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16;* Lesser General Public License for more details. 17;* 18;* You should have received a copy of the GNU Lesser General Public 19;* License along with FFmpeg; if not, write to the Free Software 20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21;****************************************************************************** 22 23%include "libavutil/x86/x86util.asm" 24 25SECTION_RODATA 26 27pw_27: times 8 dw 27 28pw_63: times 8 dw 63 29 30pb_4: times 16 db 4 31pb_F8: times 16 db 0xF8 32pb_FE: times 16 db 0xFE 33pb_27_63: times 8 db 27, 63 34pb_18_63: times 8 db 18, 63 35pb_9_63: times 8 db 9, 63 36 37cextern pb_1 38cextern pb_3 39cextern pw_9 40cextern pw_18 41cextern pb_80 42 43SECTION .text 44 45;----------------------------------------------------------------------------- 46; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim); 47;----------------------------------------------------------------------------- 48 49; macro called with 7 mm register indexes as argument, and 4 regular registers 50; 51; first 4 mm registers will carry the transposed pixel data 52; the other three are scratchspace (one would be sufficient, but this allows 53; for more spreading/pipelining and thus faster 
execution on OOE CPUs) 54; 55; first two regular registers are buf+4*stride and buf+5*stride 56; third is -stride, fourth is +stride 57%macro READ_8x4_INTERLEAVED 11 58 ; interleave 8 (A-H) rows of 4 pixels each 59 movd m%1, [%8+%10*4] ; A0-3 60 movd m%5, [%9+%10*4] ; B0-3 61 movd m%2, [%8+%10*2] ; C0-3 62 movd m%6, [%8+%10] ; D0-3 63 movd m%3, [%8] ; E0-3 64 movd m%7, [%9] ; F0-3 65 movd m%4, [%9+%11] ; G0-3 66 punpcklbw m%1, m%5 ; A/B interleaved 67 movd m%5, [%9+%11*2] ; H0-3 68 punpcklbw m%2, m%6 ; C/D interleaved 69 punpcklbw m%3, m%7 ; E/F interleaved 70 punpcklbw m%4, m%5 ; G/H interleaved 71%endmacro 72 73; macro called with 7 mm register indexes as argument, and 5 regular registers 74; first 11 mean the same as READ_8x4_TRANSPOSED above 75; fifth regular register is scratchspace to reach the bottom 8 rows, it 76; will be set to second regular register + 8*stride at the end 77%macro READ_16x4_INTERLEAVED 12 78 ; transpose 16 (A-P) rows of 4 pixels each 79 lea %12, [r0+8*r2] 80 81 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M 82 movd m%1, [%8+%10*4] ; A0-3 83 movd m%3, [%12+%10*4] ; I0-3 84 movd m%2, [%8+%10*2] ; C0-3 85 movd m%4, [%12+%10*2] ; K0-3 86 movd m%6, [%8+%10] ; D0-3 87 movd m%5, [%12+%10] ; L0-3 88 movd m%7, [%12] ; M0-3 89 add %12, %11 90 punpcklbw m%1, m%3 ; A/I 91 movd m%3, [%8] ; E0-3 92 punpcklbw m%2, m%4 ; C/K 93 punpcklbw m%6, m%5 ; D/L 94 punpcklbw m%3, m%7 ; E/M 95 punpcklbw m%2, m%6 ; C/D/K/L interleaved 96 97 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P 98 movd m%5, [%9+%10*4] ; B0-3 99 movd m%4, [%12+%10*4] ; J0-3 100 movd m%7, [%9] ; F0-3 101 movd m%6, [%12] ; N0-3 102 punpcklbw m%5, m%4 ; B/J 103 punpcklbw m%7, m%6 ; F/N 104 punpcklbw m%1, m%5 ; A/B/I/J interleaved 105 punpcklbw m%3, m%7 ; E/F/M/N interleaved 106 movd m%4, [%9+%11] ; G0-3 107 movd m%6, [%12+%11] ; O0-3 108 movd m%5, [%9+%11*2] ; H0-3 109 movd m%7, [%12+%11*2] ; P0-3 110 punpcklbw m%4, m%6 ; G/O 111 punpcklbw m%5, 
m%7 ; H/P 112 punpcklbw m%4, m%5 ; G/H/O/P interleaved 113%endmacro 114 115; write 4 mm registers of 2 dwords each 116; first four arguments are mm register indexes containing source data 117; last four are registers containing buf+4*stride, buf+5*stride, 118; -stride and +stride 119%macro WRITE_4x2D 8 120 ; write out (2 dwords per register) 121 movd [%5+%7*4], m%1 122 movd [%5+%7*2], m%2 123 movd [%5], m%3 124 movd [%6+%8], m%4 125 punpckhdq m%1, m%1 126 punpckhdq m%2, m%2 127 punpckhdq m%3, m%3 128 punpckhdq m%4, m%4 129 movd [%6+%7*4], m%1 130 movd [%5+%7], m%2 131 movd [%6], m%3 132 movd [%6+%8*2], m%4 133%endmacro 134 135; write 4 xmm registers of 4 dwords each 136; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular 137; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride 138; we add 1*stride to the third regular registry in the process 139; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the 140; same memory region), or 8 if they cover two separate buffers (third one points to 141; a different memory region than the first two), allowing for more optimal code for 142; the 16-width case 143%macro WRITE_4x4D 10 144 ; write out (4 dwords per register), start with dwords zero 145 movd [%5+%8*4], m%1 146 movd [%5], m%2 147 movd [%7+%8*4], m%3 148 movd [%7], m%4 149 150 ; store dwords 1 151 psrldq m%1, 4 152 psrldq m%2, 4 153 psrldq m%3, 4 154 psrldq m%4, 4 155 movd [%6+%8*4], m%1 156 movd [%6], m%2 157%if %10 == 16 158 movd [%6+%9*4], m%3 159%endif 160 movd [%7+%9], m%4 161 162 ; write dwords 2 163 psrldq m%1, 4 164 psrldq m%2, 4 165%if %10 == 8 166 movd [%5+%8*2], m%1 167 movd %5d, m%3 168%endif 169 psrldq m%3, 4 170 psrldq m%4, 4 171%if %10 == 16 172 movd [%5+%8*2], m%1 173%endif 174 movd [%6+%9], m%2 175 movd [%7+%8*2], m%3 176 movd [%7+%9*2], m%4 177 add %7, %9 178 179 ; store dwords 3 180 psrldq m%1, 4 181 psrldq m%2, 4 182 psrldq m%3, 4 183 psrldq m%4, 4 184%if %10 == 8 185 
mov [%7+%8*4], %5d 186 movd [%6+%8*2], m%1 187%else 188 movd [%5+%8], m%1 189%endif 190 movd [%6+%9*2], m%2 191 movd [%7+%8*2], m%3 192 movd [%7+%9*2], m%4 193%endmacro 194 195; write 4 or 8 words in the mmx/xmm registers as 8 lines 196; 1 and 2 are the registers to write, this can be the same (for SSE2) 197; for pre-SSE4: 198; 3 is a general-purpose register that we will clobber 199; for SSE4: 200; 3 is a pointer to the destination's 5th line 201; 4 is a pointer to the destination's 4th line 202; 5/6 is -stride and +stride 203%macro WRITE_2x4W 6 204 movd %3d, %1 205 punpckhdq %1, %1 206 mov [%4+%5*4], %3w 207 shr %3, 16 208 add %4, %6 209 mov [%4+%5*4], %3w 210 211 movd %3d, %1 212 add %4, %5 213 mov [%4+%5*2], %3w 214 shr %3, 16 215 mov [%4+%5 ], %3w 216 217 movd %3d, %2 218 punpckhdq %2, %2 219 mov [%4 ], %3w 220 shr %3, 16 221 mov [%4+%6 ], %3w 222 223 movd %3d, %2 224 add %4, %6 225 mov [%4+%6 ], %3w 226 shr %3, 16 227 mov [%4+%6*2], %3w 228 add %4, %5 229%endmacro 230 231%macro WRITE_8W 5 232%if cpuflag(sse4) 233 pextrw [%3+%4*4], %1, 0 234 pextrw [%2+%4*4], %1, 1 235 pextrw [%3+%4*2], %1, 2 236 pextrw [%3+%4 ], %1, 3 237 pextrw [%3 ], %1, 4 238 pextrw [%2 ], %1, 5 239 pextrw [%2+%5 ], %1, 6 240 pextrw [%2+%5*2], %1, 7 241%else 242 movd %2d, %1 243 psrldq %1, 4 244 mov [%3+%4*4], %2w 245 shr %2, 16 246 add %3, %5 247 mov [%3+%4*4], %2w 248 249 movd %2d, %1 250 psrldq %1, 4 251 add %3, %4 252 mov [%3+%4*2], %2w 253 shr %2, 16 254 mov [%3+%4 ], %2w 255 256 movd %2d, %1 257 psrldq %1, 4 258 mov [%3 ], %2w 259 shr %2, 16 260 mov [%3+%5 ], %2w 261 262 movd %2d, %1 263 add %3, %5 264 mov [%3+%5 ], %2w 265 shr %2, 16 266 mov [%3+%5*2], %2w 267%endif 268%endmacro 269 270%macro SIMPLE_LOOPFILTER 2 271cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr 272%if mmsize == 8 ; mmx/mmxext 273 mov cntrq, 2 274%endif 275%if cpuflag(ssse3) 276 pxor m0, m0 277%endif 278 SPLATB_REG m7, flim, m0 ; splat "flim" into register 279 280 ; set up indexes to address 4 
rows 281%if mmsize == 8 282 DEFINE_ARGS dst1, mstride, stride, cntr, dst2 283%else 284 DEFINE_ARGS dst1, mstride, stride, dst3, dst2 285%endif 286 mov strideq, mstrideq 287 neg mstrideq 288%ifidn %1, h 289 lea dst1q, [dst1q+4*strideq-2] 290%endif 291 292%if mmsize == 8 ; mmx / mmxext 293.next8px: 294%endif 295%ifidn %1, v 296 ; read 4 half/full rows of pixels 297 mova m0, [dst1q+mstrideq*2] ; p1 298 mova m1, [dst1q+mstrideq] ; p0 299 mova m2, [dst1q] ; q0 300 mova m3, [dst1q+ strideq] ; q1 301%else ; h 302 lea dst2q, [dst1q+ strideq] 303 304%if mmsize == 8 ; mmx/mmxext 305 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq 306%else ; sse2 307 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q 308%endif 309 TRANSPOSE4x4W 0, 1, 2, 3, 4 310%endif 311 312 ; simple_limit 313 mova m5, m2 ; m5=backup of q0 314 mova m6, m1 ; m6=backup of p0 315 psubusb m1, m2 ; p0-q0 316 psubusb m2, m6 ; q0-p0 317 por m1, m2 ; FFABS(p0-q0) 318 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 319 320 mova m4, m3 321 mova m2, m0 322 psubusb m3, m0 ; q1-p1 323 psubusb m0, m4 ; p1-q1 324 por m3, m0 ; FFABS(p1-q1) 325 mova m0, [pb_80] 326 pxor m2, m0 327 pxor m4, m0 328 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below 329 pand m3, [pb_FE] 330 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed 331 paddusb m3, m1 332 psubusb m3, m7 333 pxor m1, m1 334 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) 335 336 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) 337 mova m4, m5 338 pxor m5, m0 339 pxor m0, m6 340 psubsb m5, m0 ; q0-p0 (signed) 341 paddsb m2, m5 342 paddsb m2, m5 343 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) 344 pand m2, m3 ; apply filter mask (m3) 345 346 mova m3, [pb_F8] 347 mova m1, m2 348 paddsb m2, [pb_4] ; f1<<3=a+4 349 paddsb m1, [pb_3] ; f2<<3=a+3 350 pand m2, m3 351 pand m1, m3 ; cache f2<<3 352 353 pxor m0, m0 354 pxor m3, m3 355 pcmpgtb m0, m2 ; which values are <0? 
356 psubb m3, m2 ; -f1<<3 357 psrlq m2, 3 ; +f1 358 psrlq m3, 3 ; -f1 359 pand m3, m0 360 pandn m0, m2 361 psubusb m4, m0 362 paddusb m4, m3 ; q0-f1 363 364 pxor m0, m0 365 pxor m3, m3 366 pcmpgtb m0, m1 ; which values are <0? 367 psubb m3, m1 ; -f2<<3 368 psrlq m1, 3 ; +f2 369 psrlq m3, 3 ; -f2 370 pand m3, m0 371 pandn m0, m1 372 paddusb m6, m0 373 psubusb m6, m3 ; p0+f2 374 375 ; store 376%ifidn %1, v 377 mova [dst1q], m4 378 mova [dst1q+mstrideq], m6 379%else ; h 380 inc dst1q 381 SBUTTERFLY bw, 6, 4, 0 382 383%if mmsize == 16 ; sse2 384%if cpuflag(sse4) 385 inc dst2q 386%endif 387 WRITE_8W m6, dst2q, dst1q, mstrideq, strideq 388 lea dst2q, [dst3q+mstrideq+1] 389%if cpuflag(sse4) 390 inc dst3q 391%endif 392 WRITE_8W m4, dst3q, dst2q, mstrideq, strideq 393%else ; mmx/mmxext 394 WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq 395%endif 396%endif 397 398%if mmsize == 8 ; mmx/mmxext 399 ; next 8 pixels 400%ifidn %1, v 401 add dst1q, 8 ; advance 8 cols = pixels 402%else ; h 403 lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines 404%endif 405 dec cntrq 406 jg .next8px 407 REP_RET 408%else ; sse2 409 RET 410%endif 411%endmacro 412 413%if ARCH_X86_32 414INIT_MMX mmx 415SIMPLE_LOOPFILTER v, 4 416SIMPLE_LOOPFILTER h, 5 417INIT_MMX mmxext 418SIMPLE_LOOPFILTER v, 4 419SIMPLE_LOOPFILTER h, 5 420%endif 421 422INIT_XMM sse2 423SIMPLE_LOOPFILTER v, 3 424SIMPLE_LOOPFILTER h, 5 425INIT_XMM ssse3 426SIMPLE_LOOPFILTER v, 3 427SIMPLE_LOOPFILTER h, 5 428INIT_XMM sse4 429SIMPLE_LOOPFILTER h, 5 430 431;----------------------------------------------------------------------------- 432; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride, 433; int flimE, int flimI, int hev_thr); 434;----------------------------------------------------------------------------- 435 436%macro INNER_LOOPFILTER 2 437%define stack_size 0 438%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr 439%ifidn %1, v ; [3]=hev() result 440%define stack_size mmsize * -4 
441%else ; h ; extra storage space for transposes 442%define stack_size mmsize * -5 443%endif 444%endif 445 446%if %2 == 8 ; chroma 447cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr 448%else ; luma 449cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr 450%endif 451 452%if cpuflag(ssse3) 453 pxor m7, m7 454%endif 455 456%ifndef m8 457 ; splat function arguments 458 SPLATB_REG m0, flimEq, m7 ; E 459 SPLATB_REG m1, flimIq, m7 ; I 460 SPLATB_REG m2, hevthrq, m7 ; hev_thresh 461 462%define m_flimE [rsp] 463%define m_flimI [rsp+mmsize] 464%define m_hevthr [rsp+mmsize*2] 465%define m_maskres [rsp+mmsize*3] 466%define m_p0backup [rsp+mmsize*3] 467%define m_q0backup [rsp+mmsize*4] 468 469 mova m_flimE, m0 470 mova m_flimI, m1 471 mova m_hevthr, m2 472%else 473%define m_flimE m9 474%define m_flimI m10 475%define m_hevthr m11 476%define m_maskres m12 477%define m_p0backup m12 478%define m_q0backup m8 479 480 ; splat function arguments 481 SPLATB_REG m_flimE, flimEq, m7 ; E 482 SPLATB_REG m_flimI, flimIq, m7 ; I 483 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh 484%endif 485 486%if %2 == 8 ; chroma 487 DEFINE_ARGS dst1, dst8, mstride, stride, dst2 488%elif mmsize == 8 489 DEFINE_ARGS dst1, mstride, stride, dst2, cntr 490 mov cntrq, 2 491%else 492 DEFINE_ARGS dst1, mstride, stride, dst2, dst8 493%endif 494 mov strideq, mstrideq 495 neg mstrideq 496%ifidn %1, h 497 lea dst1q, [dst1q+strideq*4-4] 498%if %2 == 8 ; chroma 499 lea dst8q, [dst8q+strideq*4-4] 500%endif 501%endif 502 503%if mmsize == 8 504.next8px: 505%endif 506 ; read 507 lea dst2q, [dst1q+strideq] 508%ifidn %1, v 509%if %2 == 8 && mmsize == 16 510%define movrow movh 511%else 512%define movrow mova 513%endif 514 movrow m0, [dst1q+mstrideq*4] ; p3 515 movrow m1, [dst2q+mstrideq*4] ; p2 516 movrow m2, [dst1q+mstrideq*2] ; p1 517 movrow m5, [dst2q] ; q1 518 movrow m6, [dst2q+ strideq*1] ; q2 519 movrow m7, [dst2q+ strideq*2] ; 
q3 520%if mmsize == 16 && %2 == 8 521 movhps m0, [dst8q+mstrideq*4] 522 movhps m2, [dst8q+mstrideq*2] 523 add dst8q, strideq 524 movhps m1, [dst8q+mstrideq*4] 525 movhps m5, [dst8q] 526 movhps m6, [dst8q+ strideq ] 527 movhps m7, [dst8q+ strideq*2] 528 add dst8q, mstrideq 529%endif 530%elif mmsize == 8 ; mmx/mmxext (h) 531 ; read 8 rows of 8px each 532 movu m0, [dst1q+mstrideq*4] 533 movu m1, [dst2q+mstrideq*4] 534 movu m2, [dst1q+mstrideq*2] 535 movu m3, [dst1q+mstrideq ] 536 movu m4, [dst1q] 537 movu m5, [dst2q] 538 movu m6, [dst2q+ strideq ] 539 540 ; 8x8 transpose 541 TRANSPOSE4x4B 0, 1, 2, 3, 7 542 mova m_q0backup, m1 543 movu m7, [dst2q+ strideq*2] 544 TRANSPOSE4x4B 4, 5, 6, 7, 1 545 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 546 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 547 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 548 mova m1, m_q0backup 549 mova m_q0backup, m2 ; store q0 550 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 551 mova m_p0backup, m5 ; store p0 552 SWAP 1, 4 553 SWAP 2, 4 554 SWAP 6, 3 555 SWAP 5, 3 556%else ; sse2 (h) 557%if %2 == 16 558 lea dst8q, [dst1q+ strideq*8] 559%endif 560 561 ; read 16 rows of 8px each, interleave 562 movh m0, [dst1q+mstrideq*4] 563 movh m1, [dst8q+mstrideq*4] 564 movh m2, [dst1q+mstrideq*2] 565 movh m5, [dst8q+mstrideq*2] 566 movh m3, [dst1q+mstrideq ] 567 movh m6, [dst8q+mstrideq ] 568 movh m4, [dst1q] 569 movh m7, [dst8q] 570 punpcklbw m0, m1 ; A/I 571 punpcklbw m2, m5 ; C/K 572 punpcklbw m3, m6 ; D/L 573 punpcklbw m4, m7 ; E/M 574 575 add dst8q, strideq 576 movh m1, [dst2q+mstrideq*4] 577 movh m6, [dst8q+mstrideq*4] 578 movh m5, [dst2q] 579 movh m7, [dst8q] 580 punpcklbw m1, m6 ; B/J 581 punpcklbw m5, m7 ; F/N 582 movh m6, [dst2q+ strideq ] 583 movh m7, [dst8q+ strideq ] 584 punpcklbw m6, m7 ; G/O 585 586 ; 8x16 transpose 587 TRANSPOSE4x4B 0, 1, 2, 3, 7 588%ifdef m8 589 SWAP 1, 8 590%else 591 mova m_q0backup, m1 592%endif 593 movh m7, [dst2q+ strideq*2] 594 movh m1, [dst8q+ strideq*2] 595 punpcklbw m7, m1 ; H/P 596 TRANSPOSE4x4B 4, 5, 6, 7, 1 597 SBUTTERFLY dq, 0, 
4, 1 ; p3/p2 598 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 599 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 600%ifdef m8 601 SWAP 1, 8 602 SWAP 2, 8 603%else 604 mova m1, m_q0backup 605 mova m_q0backup, m2 ; store q0 606%endif 607 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 608%ifdef m12 609 SWAP 5, 12 610%else 611 mova m_p0backup, m5 ; store p0 612%endif 613 SWAP 1, 4 614 SWAP 2, 4 615 SWAP 6, 3 616 SWAP 5, 3 617%endif 618 619 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 620 mova m4, m1 621 SWAP 4, 1 622 psubusb m4, m0 ; p2-p3 623 psubusb m0, m1 ; p3-p2 624 por m0, m4 ; abs(p3-p2) 625 626 mova m4, m2 627 SWAP 4, 2 628 psubusb m4, m1 ; p1-p2 629 psubusb m1, m2 ; p2-p1 630 por m1, m4 ; abs(p2-p1) 631 632 mova m4, m6 633 SWAP 4, 6 634 psubusb m4, m7 ; q2-q3 635 psubusb m7, m6 ; q3-q2 636 por m7, m4 ; abs(q3-q2) 637 638 mova m4, m5 639 SWAP 4, 5 640 psubusb m4, m6 ; q1-q2 641 psubusb m6, m5 ; q2-q1 642 por m6, m4 ; abs(q2-q1) 643 644%if notcpuflag(mmxext) 645 mova m4, m_flimI 646 pxor m3, m3 647 psubusb m0, m4 648 psubusb m1, m4 649 psubusb m7, m4 650 psubusb m6, m4 651 pcmpeqb m0, m3 ; abs(p3-p2) <= I 652 pcmpeqb m1, m3 ; abs(p2-p1) <= I 653 pcmpeqb m7, m3 ; abs(q3-q2) <= I 654 pcmpeqb m6, m3 ; abs(q2-q1) <= I 655 pand m0, m1 656 pand m7, m6 657 pand m0, m7 658%else ; mmxext/sse2 659 pmaxub m0, m1 660 pmaxub m6, m7 661 pmaxub m0, m6 662%endif 663 664 ; normal_limit and high_edge_variance for p1-p0, q1-q0 665 SWAP 7, 3 ; now m7 is zero 666%ifidn %1, v 667 movrow m3, [dst1q+mstrideq ] ; p0 668%if mmsize == 16 && %2 == 8 669 movhps m3, [dst8q+mstrideq ] 670%endif 671%elifdef m12 672 SWAP 3, 12 673%else 674 mova m3, m_p0backup 675%endif 676 677 mova m1, m2 678 SWAP 1, 2 679 mova m6, m3 680 SWAP 3, 6 681 psubusb m1, m3 ; p1-p0 682 psubusb m6, m2 ; p0-p1 683 por m1, m6 ; abs(p1-p0) 684%if notcpuflag(mmxext) 685 mova m6, m1 686 psubusb m1, m4 687 psubusb m6, m_hevthr 688 pcmpeqb m1, m7 ; abs(p1-p0) <= I 689 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh 690 pand m0, m1 691 mova m_maskres, m6 692%else ; 
mmxext/sse2 693 pmaxub m0, m1 ; max_I 694 SWAP 1, 4 ; max_hev_thresh 695%endif 696 697 SWAP 6, 4 ; now m6 is I 698%ifidn %1, v 699 movrow m4, [dst1q] ; q0 700%if mmsize == 16 && %2 == 8 701 movhps m4, [dst8q] 702%endif 703%elifdef m8 704 SWAP 4, 8 705%else 706 mova m4, m_q0backup 707%endif 708 mova m1, m4 709 SWAP 1, 4 710 mova m7, m5 711 SWAP 7, 5 712 psubusb m1, m5 ; q0-q1 713 psubusb m7, m4 ; q1-q0 714 por m1, m7 ; abs(q1-q0) 715%if notcpuflag(mmxext) 716 mova m7, m1 717 psubusb m1, m6 718 psubusb m7, m_hevthr 719 pxor m6, m6 720 pcmpeqb m1, m6 ; abs(q1-q0) <= I 721 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh 722 mova m6, m_maskres 723 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I 724 pand m6, m7 725%else ; mmxext/sse2 726 pxor m7, m7 727 pmaxub m0, m1 728 pmaxub m6, m1 729 psubusb m0, m_flimI 730 psubusb m6, m_hevthr 731 pcmpeqb m0, m7 ; max(abs(..)) <= I 732 pcmpeqb m6, m7 ; !(max(abs..) > thresh) 733%endif 734%ifdef m12 735 SWAP 6, 12 736%else 737 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) 738%endif 739 740 ; simple_limit 741 mova m1, m3 742 SWAP 1, 3 743 mova m6, m4 ; keep copies of p0/q0 around for later use 744 SWAP 6, 4 745 psubusb m1, m4 ; p0-q0 746 psubusb m6, m3 ; q0-p0 747 por m1, m6 ; abs(q0-p0) 748 paddusb m1, m1 ; m1=2*abs(q0-p0) 749 750 mova m7, m2 751 SWAP 7, 2 752 mova m6, m5 753 SWAP 6, 5 754 psubusb m7, m5 ; p1-q1 755 psubusb m6, m2 ; q1-p1 756 por m7, m6 ; abs(q1-p1) 757 pxor m6, m6 758 pand m7, [pb_FE] 759 psrlq m7, 1 ; abs(q1-p1)/2 760 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 761 psubusb m7, m_flimE 762 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E 763 pand m0, m7 ; normal_limit result 764 765 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask 766%ifdef m8 ; x86-64 && sse2 767 mova m8, [pb_80] 768%define m_pb_80 m8 769%else ; x86-32 or mmx/mmxext 770%define m_pb_80 [pb_80] 771%endif 772 mova m1, m4 773 mova m7, m3 774 pxor m1, m_pb_80 775 pxor m7, m_pb_80 776 psubsb m1, m7 ; (signed) q0-p0 777 mova m6, 
m2 778 mova m7, m5 779 pxor m6, m_pb_80 780 pxor m7, m_pb_80 781 psubsb m6, m7 ; (signed) p1-q1 782 mova m7, m_maskres 783 pandn m7, m6 784 paddsb m7, m1 785 paddsb m7, m1 786 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) 787 788 pand m7, m0 789 mova m1, [pb_F8] 790 mova m6, m7 791 paddsb m7, [pb_3] 792 paddsb m6, [pb_4] 793 pand m7, m1 794 pand m6, m1 795 796 pxor m1, m1 797 pxor m0, m0 798 pcmpgtb m1, m7 799 psubb m0, m7 800 psrlq m7, 3 ; +f2 801 psrlq m0, 3 ; -f2 802 pand m0, m1 803 pandn m1, m7 804 psubusb m3, m0 805 paddusb m3, m1 ; p0+f2 806 807 pxor m1, m1 808 pxor m0, m0 809 pcmpgtb m0, m6 810 psubb m1, m6 811 psrlq m6, 3 ; +f1 812 psrlq m1, 3 ; -f1 813 pand m1, m0 814 pandn m0, m6 815 psubusb m4, m0 816 paddusb m4, m1 ; q0-f1 817 818%ifdef m12 819 SWAP 6, 12 820%else 821 mova m6, m_maskres 822%endif 823%if notcpuflag(mmxext) 824 mova m7, [pb_1] 825%else ; mmxext/sse2 826 pxor m7, m7 827%endif 828 pand m0, m6 829 pand m1, m6 830%if notcpuflag(mmxext) 831 paddusb m0, m7 832 pand m1, [pb_FE] 833 pandn m7, m0 834 psrlq m1, 1 835 psrlq m7, 1 836 SWAP 0, 7 837%else ; mmxext/sse2 838 psubusb m1, [pb_1] 839 pavgb m0, m7 ; a 840 pavgb m1, m7 ; -a 841%endif 842 psubusb m5, m0 843 psubusb m2, m1 844 paddusb m5, m1 ; q1-a 845 paddusb m2, m0 ; p1+a 846 847 ; store 848%ifidn %1, v 849 movrow [dst1q+mstrideq*2], m2 850 movrow [dst1q+mstrideq ], m3 851 movrow [dst1q], m4 852 movrow [dst1q+ strideq ], m5 853%if mmsize == 16 && %2 == 8 854 movhps [dst8q+mstrideq*2], m2 855 movhps [dst8q+mstrideq ], m3 856 movhps [dst8q], m4 857 movhps [dst8q+ strideq ], m5 858%endif 859%else ; h 860 add dst1q, 2 861 add dst2q, 2 862 863 ; 4x8/16 transpose 864 TRANSPOSE4x4B 2, 3, 4, 5, 6 865 866%if mmsize == 8 ; mmx/mmxext (h) 867 WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq 868%else ; sse2 (h) 869 lea dst8q, [dst8q+mstrideq +2] 870 WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 871%endif 872%endif 873 874%if mmsize == 8 875%if %2 == 8 ; chroma 876%ifidn %1, h 877 sub 
dst1q, 2 878%endif 879 cmp dst1q, dst8q 880 mov dst1q, dst8q 881 jnz .next8px 882%else 883%ifidn %1, h 884 lea dst1q, [dst1q+ strideq*8-2] 885%else ; v 886 add dst1q, 8 887%endif 888 dec cntrq 889 jg .next8px 890%endif 891 REP_RET 892%else ; mmsize == 16 893 RET 894%endif 895%endmacro 896 897%if ARCH_X86_32 898INIT_MMX mmx 899INNER_LOOPFILTER v, 16 900INNER_LOOPFILTER h, 16 901INNER_LOOPFILTER v, 8 902INNER_LOOPFILTER h, 8 903 904INIT_MMX mmxext 905INNER_LOOPFILTER v, 16 906INNER_LOOPFILTER h, 16 907INNER_LOOPFILTER v, 8 908INNER_LOOPFILTER h, 8 909%endif 910 911INIT_XMM sse2 912INNER_LOOPFILTER v, 16 913INNER_LOOPFILTER h, 16 914INNER_LOOPFILTER v, 8 915INNER_LOOPFILTER h, 8 916 917INIT_XMM ssse3 918INNER_LOOPFILTER v, 16 919INNER_LOOPFILTER h, 16 920INNER_LOOPFILTER v, 8 921INNER_LOOPFILTER h, 8 922 923;----------------------------------------------------------------------------- 924; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride, 925; int flimE, int flimI, int hev_thr); 926;----------------------------------------------------------------------------- 927 928%macro MBEDGE_LOOPFILTER 2 929%define stack_size 0 930%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr 931%if mmsize == 16 ; [3]=hev() result 932 ; [4]=filter tmp result 933 ; [5]/[6] = p2/q2 backup 934 ; [7]=lim_res sign result 935%define stack_size mmsize * -7 936%else ; 8 ; extra storage space for transposes 937%define stack_size mmsize * -8 938%endif 939%endif 940 941%if %2 == 8 ; chroma 942cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr 943%else ; luma 944cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr 945%endif 946 947%if cpuflag(ssse3) 948 pxor m7, m7 949%endif 950 951%ifndef m8 952 ; splat function arguments 953 SPLATB_REG m0, flimEq, m7 ; E 954 SPLATB_REG m1, flimIq, m7 ; I 955 SPLATB_REG m2, hevthrq, m7 ; hev_thresh 956 957%define m_flimE [rsp] 
958%define m_flimI [rsp+mmsize] 959%define m_hevthr [rsp+mmsize*2] 960%define m_maskres [rsp+mmsize*3] 961%define m_limres [rsp+mmsize*4] 962%define m_p0backup [rsp+mmsize*3] 963%define m_q0backup [rsp+mmsize*4] 964%define m_p2backup [rsp+mmsize*5] 965%define m_q2backup [rsp+mmsize*6] 966%if mmsize == 16 967%define m_limsign [rsp] 968%else 969%define m_limsign [rsp+mmsize*7] 970%endif 971 972 mova m_flimE, m0 973 mova m_flimI, m1 974 mova m_hevthr, m2 975%else ; sse2 on x86-64 976%define m_flimE m9 977%define m_flimI m10 978%define m_hevthr m11 979%define m_maskres m12 980%define m_limres m8 981%define m_p0backup m12 982%define m_q0backup m8 983%define m_p2backup m13 984%define m_q2backup m14 985%define m_limsign m9 986 987 ; splat function arguments 988 SPLATB_REG m_flimE, flimEq, m7 ; E 989 SPLATB_REG m_flimI, flimIq, m7 ; I 990 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh 991%endif 992 993%if %2 == 8 ; chroma 994 DEFINE_ARGS dst1, dst8, mstride, stride, dst2 995%elif mmsize == 8 996 DEFINE_ARGS dst1, mstride, stride, dst2, cntr 997 mov cntrq, 2 998%else 999 DEFINE_ARGS dst1, mstride, stride, dst2, dst8 1000%endif 1001 mov strideq, mstrideq 1002 neg mstrideq 1003%ifidn %1, h 1004 lea dst1q, [dst1q+strideq*4-4] 1005%if %2 == 8 ; chroma 1006 lea dst8q, [dst8q+strideq*4-4] 1007%endif 1008%endif 1009 1010%if mmsize == 8 1011.next8px: 1012%endif 1013 ; read 1014 lea dst2q, [dst1q+ strideq ] 1015%ifidn %1, v 1016%if %2 == 8 && mmsize == 16 1017%define movrow movh 1018%else 1019%define movrow mova 1020%endif 1021 movrow m0, [dst1q+mstrideq*4] ; p3 1022 movrow m1, [dst2q+mstrideq*4] ; p2 1023 movrow m2, [dst1q+mstrideq*2] ; p1 1024 movrow m5, [dst2q] ; q1 1025 movrow m6, [dst2q+ strideq ] ; q2 1026 movrow m7, [dst2q+ strideq*2] ; q3 1027%if mmsize == 16 && %2 == 8 1028 movhps m0, [dst8q+mstrideq*4] 1029 movhps m2, [dst8q+mstrideq*2] 1030 add dst8q, strideq 1031 movhps m1, [dst8q+mstrideq*4] 1032 movhps m5, [dst8q] 1033 movhps m6, [dst8q+ strideq ] 1034 movhps m7, 
[dst8q+ strideq*2] 1035 add dst8q, mstrideq 1036%endif 1037%elif mmsize == 8 ; mmx/mmxext (h) 1038 ; read 8 rows of 8px each 1039 movu m0, [dst1q+mstrideq*4] 1040 movu m1, [dst2q+mstrideq*4] 1041 movu m2, [dst1q+mstrideq*2] 1042 movu m3, [dst1q+mstrideq ] 1043 movu m4, [dst1q] 1044 movu m5, [dst2q] 1045 movu m6, [dst2q+ strideq ] 1046 1047 ; 8x8 transpose 1048 TRANSPOSE4x4B 0, 1, 2, 3, 7 1049 mova m_q0backup, m1 1050 movu m7, [dst2q+ strideq*2] 1051 TRANSPOSE4x4B 4, 5, 6, 7, 1 1052 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 1053 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 1054 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 1055 mova m1, m_q0backup 1056 mova m_q0backup, m2 ; store q0 1057 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 1058 mova m_p0backup, m5 ; store p0 1059 SWAP 1, 4 1060 SWAP 2, 4 1061 SWAP 6, 3 1062 SWAP 5, 3 1063%else ; sse2 (h) 1064%if %2 == 16 1065 lea dst8q, [dst1q+ strideq*8 ] 1066%endif 1067 1068 ; read 16 rows of 8px each, interleave 1069 movh m0, [dst1q+mstrideq*4] 1070 movh m1, [dst8q+mstrideq*4] 1071 movh m2, [dst1q+mstrideq*2] 1072 movh m5, [dst8q+mstrideq*2] 1073 movh m3, [dst1q+mstrideq ] 1074 movh m6, [dst8q+mstrideq ] 1075 movh m4, [dst1q] 1076 movh m7, [dst8q] 1077 punpcklbw m0, m1 ; A/I 1078 punpcklbw m2, m5 ; C/K 1079 punpcklbw m3, m6 ; D/L 1080 punpcklbw m4, m7 ; E/M 1081 1082 add dst8q, strideq 1083 movh m1, [dst2q+mstrideq*4] 1084 movh m6, [dst8q+mstrideq*4] 1085 movh m5, [dst2q] 1086 movh m7, [dst8q] 1087 punpcklbw m1, m6 ; B/J 1088 punpcklbw m5, m7 ; F/N 1089 movh m6, [dst2q+ strideq ] 1090 movh m7, [dst8q+ strideq ] 1091 punpcklbw m6, m7 ; G/O 1092 1093 ; 8x16 transpose 1094 TRANSPOSE4x4B 0, 1, 2, 3, 7 1095%ifdef m8 1096 SWAP 1, 8 1097%else 1098 mova m_q0backup, m1 1099%endif 1100 movh m7, [dst2q+ strideq*2] 1101 movh m1, [dst8q+ strideq*2] 1102 punpcklbw m7, m1 ; H/P 1103 TRANSPOSE4x4B 4, 5, 6, 7, 1 1104 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 1105 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 1106 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 1107%ifdef m8 1108 SWAP 1, 8 1109 SWAP 2, 8 1110%else 1111 mova m1, 
m_q0backup 1112 mova m_q0backup, m2 ; store q0 1113%endif 1114 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 1115%ifdef m12 1116 SWAP 5, 12 1117%else 1118 mova m_p0backup, m5 ; store p0 1119%endif 1120 SWAP 1, 4 1121 SWAP 2, 4 1122 SWAP 6, 3 1123 SWAP 5, 3 1124%endif 1125 1126 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 1127 mova m4, m1 1128 SWAP 4, 1 1129 psubusb m4, m0 ; p2-p3 1130 psubusb m0, m1 ; p3-p2 1131 por m0, m4 ; abs(p3-p2) 1132 1133 mova m4, m2 1134 SWAP 4, 2 1135 psubusb m4, m1 ; p1-p2 1136 mova m_p2backup, m1 1137 psubusb m1, m2 ; p2-p1 1138 por m1, m4 ; abs(p2-p1) 1139 1140 mova m4, m6 1141 SWAP 4, 6 1142 psubusb m4, m7 ; q2-q3 1143 psubusb m7, m6 ; q3-q2 1144 por m7, m4 ; abs(q3-q2) 1145 1146 mova m4, m5 1147 SWAP 4, 5 1148 psubusb m4, m6 ; q1-q2 1149 mova m_q2backup, m6 1150 psubusb m6, m5 ; q2-q1 1151 por m6, m4 ; abs(q2-q1) 1152 1153%if notcpuflag(mmxext) 1154 mova m4, m_flimI 1155 pxor m3, m3 1156 psubusb m0, m4 1157 psubusb m1, m4 1158 psubusb m7, m4 1159 psubusb m6, m4 1160 pcmpeqb m0, m3 ; abs(p3-p2) <= I 1161 pcmpeqb m1, m3 ; abs(p2-p1) <= I 1162 pcmpeqb m7, m3 ; abs(q3-q2) <= I 1163 pcmpeqb m6, m3 ; abs(q2-q1) <= I 1164 pand m0, m1 1165 pand m7, m6 1166 pand m0, m7 1167%else ; mmxext/sse2 1168 pmaxub m0, m1 1169 pmaxub m6, m7 1170 pmaxub m0, m6 1171%endif 1172 1173 ; normal_limit and high_edge_variance for p1-p0, q1-q0 1174 SWAP 7, 3 ; now m7 is zero 1175%ifidn %1, v 1176 movrow m3, [dst1q+mstrideq ] ; p0 1177%if mmsize == 16 && %2 == 8 1178 movhps m3, [dst8q+mstrideq ] 1179%endif 1180%elifdef m12 1181 SWAP 3, 12 1182%else 1183 mova m3, m_p0backup 1184%endif 1185 1186 mova m1, m2 1187 SWAP 1, 2 1188 mova m6, m3 1189 SWAP 3, 6 1190 psubusb m1, m3 ; p1-p0 1191 psubusb m6, m2 ; p0-p1 1192 por m1, m6 ; abs(p1-p0) 1193%if notcpuflag(mmxext) 1194 mova m6, m1 1195 psubusb m1, m4 1196 psubusb m6, m_hevthr 1197 pcmpeqb m1, m7 ; abs(p1-p0) <= I 1198 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh 1199 pand m0, m1 1200 mova m_maskres, m6 1201%else ; mmxext/sse2 1202 
pmaxub m0, m1 ; max_I 1203 SWAP 1, 4 ; max_hev_thresh 1204%endif 1205 1206 SWAP 6, 4 ; now m6 is I 1207%ifidn %1, v 1208 movrow m4, [dst1q] ; q0 1209%if mmsize == 16 && %2 == 8 1210 movhps m4, [dst8q] 1211%endif 1212%elifdef m8 1213 SWAP 4, 8 1214%else 1215 mova m4, m_q0backup 1216%endif 1217 mova m1, m4 1218 SWAP 1, 4 1219 mova m7, m5 1220 SWAP 7, 5 1221 psubusb m1, m5 ; q0-q1 1222 psubusb m7, m4 ; q1-q0 1223 por m1, m7 ; abs(q1-q0) 1224%if notcpuflag(mmxext) 1225 mova m7, m1 1226 psubusb m1, m6 1227 psubusb m7, m_hevthr 1228 pxor m6, m6 1229 pcmpeqb m1, m6 ; abs(q1-q0) <= I 1230 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh 1231 mova m6, m_maskres 1232 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I 1233 pand m6, m7 1234%else ; mmxext/sse2 1235 pxor m7, m7 1236 pmaxub m0, m1 1237 pmaxub m6, m1 1238 psubusb m0, m_flimI 1239 psubusb m6, m_hevthr 1240 pcmpeqb m0, m7 ; max(abs(..)) <= I 1241 pcmpeqb m6, m7 ; !(max(abs..) > thresh) 1242%endif 1243%ifdef m12 1244 SWAP 6, 12 1245%else 1246 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) 1247%endif 1248 1249 ; simple_limit 1250 mova m1, m3 1251 SWAP 1, 3 1252 mova m6, m4 ; keep copies of p0/q0 around for later use 1253 SWAP 6, 4 1254 psubusb m1, m4 ; p0-q0 1255 psubusb m6, m3 ; q0-p0 1256 por m1, m6 ; abs(q0-p0) 1257 paddusb m1, m1 ; m1=2*abs(q0-p0) 1258 1259 mova m7, m2 1260 SWAP 7, 2 1261 mova m6, m5 1262 SWAP 6, 5 1263 psubusb m7, m5 ; p1-q1 1264 psubusb m6, m2 ; q1-p1 1265 por m7, m6 ; abs(q1-p1) 1266 pxor m6, m6 1267 pand m7, [pb_FE] 1268 psrlq m7, 1 ; abs(q1-p1)/2 1269 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 1270 psubusb m7, m_flimE 1271 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E 1272 pand m0, m7 ; normal_limit result 1273 1274 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask 1275%ifdef m8 ; x86-64 && sse2 1276 mova m8, [pb_80] 1277%define m_pb_80 m8 1278%else ; x86-32 or mmx/mmxext 1279%define m_pb_80 [pb_80] 1280%endif 1281 mova m1, m4 1282 mova m7, m3 1283 pxor m1, m_pb_80 1284 
pxor m7, m_pb_80 1285 psubsb m1, m7 ; (signed) q0-p0 1286 mova m6, m2 1287 mova m7, m5 1288 pxor m6, m_pb_80 1289 pxor m7, m_pb_80 1290 psubsb m6, m7 ; (signed) p1-q1 1291 mova m7, m_maskres 1292 paddsb m6, m1 1293 paddsb m6, m1 1294 paddsb m6, m1 1295 pand m6, m0 1296%ifdef m8 1297 mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge 1298 pand m_limres, m7 1299%else 1300 mova m0, m6 1301 pand m0, m7 1302 mova m_limres, m0 1303%endif 1304 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common 1305 1306 mova m1, [pb_F8] 1307 mova m6, m7 1308 paddsb m7, [pb_3] 1309 paddsb m6, [pb_4] 1310 pand m7, m1 1311 pand m6, m1 1312 1313 pxor m1, m1 1314 pxor m0, m0 1315 pcmpgtb m1, m7 1316 psubb m0, m7 1317 psrlq m7, 3 ; +f2 1318 psrlq m0, 3 ; -f2 1319 pand m0, m1 1320 pandn m1, m7 1321 psubusb m3, m0 1322 paddusb m3, m1 ; p0+f2 1323 1324 pxor m1, m1 1325 pxor m0, m0 1326 pcmpgtb m0, m6 1327 psubb m1, m6 1328 psrlq m6, 3 ; +f1 1329 psrlq m1, 3 ; -f1 1330 pand m1, m0 1331 pandn m0, m6 1332 psubusb m4, m0 1333 paddusb m4, m1 ; q0-f1 1334 1335 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) 1336%if cpuflag(ssse3) 1337 mova m7, [pb_1] 1338%else 1339 mova m7, [pw_63] 1340%endif 1341%ifdef m8 1342 SWAP 1, 8 1343%else 1344 mova m1, m_limres 1345%endif 1346 pxor m0, m0 1347 mova m6, m1 1348 pcmpgtb m0, m1 ; which are negative 1349%if cpuflag(ssse3) 1350 punpcklbw m6, m7 ; interleave with "1" for rounding 1351 punpckhbw m1, m7 1352%else 1353 punpcklbw m6, m0 ; signed byte->word 1354 punpckhbw m1, m0 1355%endif 1356 mova m_limsign, m0 1357%if cpuflag(ssse3) 1358 mova m7, [pb_27_63] 1359%ifndef m8 1360 mova m_limres, m1 1361%endif 1362%ifdef m10 1363 SWAP 0, 10 ; don't lose lim_sign copy 1364%endif 1365 mova m0, m7 1366 pmaddubsw m7, m6 1367 SWAP 6, 7 1368 pmaddubsw m0, m1 1369 SWAP 1, 0 1370%ifdef m10 1371 SWAP 0, 10 1372%else 1373 mova m0, m_limsign 1374%endif 1375%else 1376 mova m_maskres, m6 ; backup for later in filter 1377 mova m_limres, m1 1378 pmullw m6, [pw_27] 
1379 pmullw m1, [pw_27] 1380 paddw m6, m7 1381 paddw m1, m7 1382%endif 1383 psraw m6, 7 1384 psraw m1, 7 1385 packsswb m6, m1 ; a0 1386 pxor m1, m1 1387 psubb m1, m6 1388 pand m1, m0 ; -a0 1389 pandn m0, m6 ; +a0 1390%if cpuflag(ssse3) 1391 mova m6, [pb_18_63] ; pipelining 1392%endif 1393 psubusb m3, m1 1394 paddusb m4, m1 1395 paddusb m3, m0 ; p0+a0 1396 psubusb m4, m0 ; q0-a0 1397 1398%if cpuflag(ssse3) 1399 SWAP 6, 7 1400%ifdef m10 1401 SWAP 1, 10 1402%else 1403 mova m1, m_limres 1404%endif 1405 mova m0, m7 1406 pmaddubsw m7, m6 1407 SWAP 6, 7 1408 pmaddubsw m0, m1 1409 SWAP 1, 0 1410%ifdef m10 1411 SWAP 0, 10 1412%endif 1413 mova m0, m_limsign 1414%else 1415 mova m6, m_maskres 1416 mova m1, m_limres 1417 pmullw m6, [pw_18] 1418 pmullw m1, [pw_18] 1419 paddw m6, m7 1420 paddw m1, m7 1421%endif 1422 mova m0, m_limsign 1423 psraw m6, 7 1424 psraw m1, 7 1425 packsswb m6, m1 ; a1 1426 pxor m1, m1 1427 psubb m1, m6 1428 pand m1, m0 ; -a1 1429 pandn m0, m6 ; +a1 1430%if cpuflag(ssse3) 1431 mova m6, [pb_9_63] 1432%endif 1433 psubusb m2, m1 1434 paddusb m5, m1 1435 paddusb m2, m0 ; p1+a1 1436 psubusb m5, m0 ; q1-a1 1437 1438%if cpuflag(ssse3) 1439 SWAP 6, 7 1440%ifdef m10 1441 SWAP 1, 10 1442%else 1443 mova m1, m_limres 1444%endif 1445 mova m0, m7 1446 pmaddubsw m7, m6 1447 SWAP 6, 7 1448 pmaddubsw m0, m1 1449 SWAP 1, 0 1450%else 1451%ifdef m8 1452 SWAP 6, 12 1453 SWAP 1, 8 1454%else 1455 mova m6, m_maskres 1456 mova m1, m_limres 1457%endif 1458 pmullw m6, [pw_9] 1459 pmullw m1, [pw_9] 1460 paddw m6, m7 1461 paddw m1, m7 1462%endif 1463%ifdef m9 1464 SWAP 7, 9 1465%else 1466 mova m7, m_limsign 1467%endif 1468 psraw m6, 7 1469 psraw m1, 7 1470 packsswb m6, m1 ; a1 1471 pxor m0, m0 1472 psubb m0, m6 1473 pand m0, m7 ; -a1 1474 pandn m7, m6 ; +a1 1475%ifdef m8 1476 SWAP 1, 13 1477 SWAP 6, 14 1478%else 1479 mova m1, m_p2backup 1480 mova m6, m_q2backup 1481%endif 1482 psubusb m1, m0 1483 paddusb m6, m0 1484 paddusb m1, m7 ; p1+a1 1485 psubusb m6, m7 ; q1-a1 1486 1487 ; store 
1488%ifidn %1, v 1489 movrow [dst2q+mstrideq*4], m1 1490 movrow [dst1q+mstrideq*2], m2 1491 movrow [dst1q+mstrideq ], m3 1492 movrow [dst1q], m4 1493 movrow [dst2q], m5 1494 movrow [dst2q+ strideq ], m6 1495%if mmsize == 16 && %2 == 8 1496 add dst8q, mstrideq 1497 movhps [dst8q+mstrideq*2], m1 1498 movhps [dst8q+mstrideq ], m2 1499 movhps [dst8q], m3 1500 add dst8q, strideq 1501 movhps [dst8q], m4 1502 movhps [dst8q+ strideq ], m5 1503 movhps [dst8q+ strideq*2], m6 1504%endif 1505%else ; h 1506 inc dst1q 1507 inc dst2q 1508 1509 ; 4x8/16 transpose 1510 TRANSPOSE4x4B 1, 2, 3, 4, 0 1511 SBUTTERFLY bw, 5, 6, 0 1512 1513%if mmsize == 8 ; mmx/mmxext (h) 1514 WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq 1515 add dst1q, 4 1516 WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq 1517%else ; sse2 (h) 1518 lea dst8q, [dst8q+mstrideq+1] 1519 WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 1520 lea dst1q, [dst2q+mstrideq+4] 1521 lea dst8q, [dst8q+mstrideq+4] 1522%if cpuflag(sse4) 1523 add dst2q, 4 1524%endif 1525 WRITE_8W m5, dst2q, dst1q, mstrideq, strideq 1526%if cpuflag(sse4) 1527 lea dst2q, [dst8q+ strideq ] 1528%endif 1529 WRITE_8W m6, dst2q, dst8q, mstrideq, strideq 1530%endif 1531%endif 1532 1533%if mmsize == 8 1534%if %2 == 8 ; chroma 1535%ifidn %1, h 1536 sub dst1q, 5 1537%endif 1538 cmp dst1q, dst8q 1539 mov dst1q, dst8q 1540 jnz .next8px 1541%else 1542%ifidn %1, h 1543 lea dst1q, [dst1q+ strideq*8-5] 1544%else ; v 1545 add dst1q, 8 1546%endif 1547 dec cntrq 1548 jg .next8px 1549%endif 1550 REP_RET 1551%else ; mmsize == 16 1552 RET 1553%endif 1554%endmacro 1555 1556%if ARCH_X86_32 1557INIT_MMX mmx 1558MBEDGE_LOOPFILTER v, 16 1559MBEDGE_LOOPFILTER h, 16 1560MBEDGE_LOOPFILTER v, 8 1561MBEDGE_LOOPFILTER h, 8 1562 1563INIT_MMX mmxext 1564MBEDGE_LOOPFILTER v, 16 1565MBEDGE_LOOPFILTER h, 16 1566MBEDGE_LOOPFILTER v, 8 1567MBEDGE_LOOPFILTER h, 8 1568%endif 1569 1570INIT_XMM sse2 1571MBEDGE_LOOPFILTER v, 16 1572MBEDGE_LOOPFILTER h, 16 
MBEDGE_LOOPFILTER v, 8          ; SSE2 vertical macroblock-edge filter, 8px (chroma) width
MBEDGE_LOOPFILTER h, 8          ; SSE2 horizontal macroblock-edge filter, 8px (chroma) width

; SSSE3 variants: the filter_mbedge weighting uses pmaddubsw with the
; pb_27_63 / pb_18_63 / pb_9_63 byte constants instead of pmullw + pw_* words.
INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

; SSE4 variants: only the horizontal filters are built — the cpuflag(sse4)
; branches appear solely in the horizontal store path (WRITE_8W addressing).
; NOTE(review): vertical filtering presumably falls back to the ssse3/sse2
; versions selected at init time — confirm against the dsp init code.
INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8