;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_3
cextern pb_80

pb_4:   times 16 db 0x04
pb_10:  times 16 db 0x10
pb_40:  times 16 db 0x40
pb_81:  times 16 db 0x81
pb_f8:  times 16 db 0xf8
pb_fe:  times 16 db 0xfe
pb_ff:  times 16 db 0xff

cextern pw_4
cextern pw_8

; With the mix functions, two 8-bit thresholds are packed into 16 bits of
; storage; the following mask is used to splat each of them into one half
; of the same register.
mask_mix:   times 8 db 0
            times 8 db 1

mask_mix84: times 8 db 0xff
            times 8 db 0x00
mask_mix48: times 8 db 0x00
            times 8 db 0xff

SECTION .text

%macro SCRATCH 3
%ifdef m8
    SWAP                 %1, %2
%else
    mova               [%3], m%1
%endif
%endmacro

%macro UNSCRATCH 3
%ifdef m8
    SWAP                 %1, %2
%else
    mova                m%1, [%3]
%endif
%endmacro

; %1 = abs(%2-%3)
%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
%ifdef m8
    psubusb              %1, %3, %2
    psubusb              %4, %2, %3
%else
    mova                 %1, %3
    mova                 %4, %2
    psubusb              %1, %2
    psubusb              %4, %3
%endif
    por                  %1, %4
%endmacro

; %1 = %1 > %2
%macro CMP_GT 2-3 ; src/dst, cmp, pb_80
%if %0 == 3
    pxor                 %1, %3
%endif
    pcmpgtb              %1, %2
%endmacro

; %1 = abs(%2-%3) > %4
%macro ABSSUB_GT 5-6 [pb_80] ; dst, src1, src2, cmp, tmp, [pb_80]
    ABSSUB               %1, %2, %3, %5     ; dst = abs(src1 - src2)
    CMP_GT               %1, %4, %6         ; dst > cmp
%endmacro

%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
    pand                 %1, %3             ; new &= mask
    pandn                %4, %3, %2         ; tmp = ~mask & old
    por                  %1, %4             ; new&mask | old&~mask
%endmacro

%macro UNPACK 4
%ifdef m8
    punpck%1bw           %2, %3, %4
%else
    mova                 %2, %3
    punpck%1bw           %2, %4
%endif
%endmacro

%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
                             ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
    psubw                %3, [rsp+%4+%5*mmsize*2]
    psubw                %3, [rsp+%4+%6*mmsize*2]
    paddw                %3, [rsp+%4+%7*mmsize*2]
%ifnidn %10, ""
%if %11 == 0
    punpck%2bw           %1, %10, m0
%else
    UNPACK               %2, %1, %10, m0
%endif
    mova [rsp+%4+%8*mmsize*2], %1
    paddw                %3, %1
%else
    paddw                %3, [rsp+%4+%8*mmsize*2]
%endif
    psraw                %1, %3, %9
%endmacro
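; The FILTER*_INIT/FILTER_UPDATE pairs below keep the flat filters as a
; sliding-window sum held in 16-bit lanes (an "l" and an "h" half). As a
; rough C sketch of one update step (descriptive names, not from this file):
;   sum += add1 + add2 - sub1 - sub2;  // slide the window by one output tap
;   out  = sum >> rshift;              // rshift = 3 (filter6), 4 (filter14)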
; FIXME interleave l/h better (for instruction pairing)
%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
    FILTER%7_INIT        %1, l, %3, %6 + 0
    FILTER%7_INIT        %2, h, %4, %6 + mmsize
    packuswb             %1, %2
    MASK_APPLY           %1, %9, %8, %2
    mova                 %5, %1
%endmacro

%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift,
                                         ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32]
; FIXME interleave this properly with the subx2/addx2
%ifnidn %15, ""
%if %16 == 0 || ARCH_X86_64
    mova                %14, %15
%endif
%endif
    FILTER_SUBx2_ADDx2   %1, l, %3, %6 + 0,      %7, %8, %9, %10, %11, %14, %16
    FILTER_SUBx2_ADDx2   %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16
    packuswb             %1, %2
%ifnidn %13, ""
    MASK_APPLY           %1, %13, %12, %2
%else
    MASK_APPLY           %1, %5, %12, %2
%endif
    mova                 %5, %1
%endmacro

%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
    mova                 %4, [pb_f8]
    pand                 %1, %4
    pand                 %2, %4
    psrlq                %1, 3
    psrlq                %2, 3
    pxor                 %1, %3
    pxor                 %2, %3
    psubb                %1, %3
    psubb                %2, %3
%endmacro

%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
    pxor                 %3, %3
    pxor                 %2, %2
    pcmpgtb              %3, %1             ; i8 < 0 mask
    psubb                %2, %1             ; neg values (only the originally negative ones will be kept)
    pand                 %2, %3             ; negative values of i8 (but stored as positive)
    pandn                %3, %1             ; positive values of i8
%endmacro

; clip_u8(u8 + i8)
%macro SIGN_ADD 4 ; dst, u8, i8, tmp1
    EXTRACT_POS_NEG      %3, %4, %1
    paddusb              %1, %2             ; add the positives
    psubusb              %1, %4             ; sub the negatives
%endmacro

; clip_u8(u8 - i8)
%macro SIGN_SUB 4 ; dst, u8, i8, tmp1
    EXTRACT_POS_NEG      %3, %1, %4
    paddusb              %1, %2             ; add the negatives
    psubusb              %1, %4             ; sub the positives
%endmacro

%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
    UNPACK               %2, %1, rp3, m0    ; p3: B->W
    mova [rsp+%4+0*mmsize*2], %1
    paddw                %3, %1, %1         ; p3*2
    paddw                %3, %1             ; p3*3
    punpck%2bw           %1, m1, m0         ; p2: B->W
    mova [rsp+%4+1*mmsize*2], %1
    paddw                %3, %1             ; p3*3 + p2
    paddw                %3, %1             ; p3*3 + p2*2
    UNPACK               %2, %1, rp1, m0    ; p1: B->W
    mova [rsp+%4+2*mmsize*2], %1
    paddw                %3, %1             ; p3*3 + p2*2 + p1
    UNPACK               %2, %1, rp0, m0    ; p0: B->W
    mova [rsp+%4+3*mmsize*2], %1
    paddw                %3, %1             ; p3*3 + p2*2 + p1 + p0
    UNPACK               %2, %1, rq0, m0    ; q0: B->W
    mova [rsp+%4+4*mmsize*2], %1
    paddw                %3, %1             ; p3*3 + p2*2 + p1 + p0 + q0
    paddw                %3, [pw_4]         ; p3*3 + p2*2 + p1 + p0 + q0 + 4
    psraw                %1, %3, 3          ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
%endmacro

%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
    punpck%2bw           %1, m2, m0         ; p7: B->W
    mova [rsp+%4+ 8*mmsize*2], %1
    psllw                %3, %1, 3          ; p7*8
    psubw                %3, %1             ; p7*7
    punpck%2bw           %1, m3, m0         ; p6: B->W
    mova [rsp+%4+ 9*mmsize*2], %1
    paddw                %3, %1             ; p7*7 + p6
    paddw                %3, %1             ; p7*7 + p6*2
    UNPACK               %2, %1, rp5, m0    ; p5: B->W
    mova [rsp+%4+10*mmsize*2], %1
    paddw                %3, %1             ; p7*7 + p6*2 + p5
    UNPACK               %2, %1, rp4, m0    ; p4: B->W
    mova [rsp+%4+11*mmsize*2], %1
    paddw                %3, %1             ; p7*7 + p6*2 + p5 + p4
    paddw                %3, [rsp+%4+ 0*mmsize*2] ; p7*7 + p6*2 + p5 + p4 + p3
    paddw                %3, [rsp+%4+ 1*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p2
    paddw                %3, [rsp+%4+ 2*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p1
    paddw                %3, [rsp+%4+ 3*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0
    paddw                %3, [rsp+%4+ 4*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + q0
    paddw                %3, [pw_8]         ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
    psraw                %1, %3, 4          ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
%endmacro
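; The seeds computed above are the first output tap of each flat filter:
;   filter6:  p2' = (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
;   filter14: p6' = (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
; every subsequent tap is then derived from the cached word sums through
; FILTER_UPDATE's -2/+2 slide rather than being recomputed from scratch.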
%macro TRANSPOSE16x16B 17
    mova                %17, m%16
    SBUTTERFLY           bw, %1, %2, %16
    SBUTTERFLY           bw, %3, %4, %16
    SBUTTERFLY           bw, %5, %6, %16
    SBUTTERFLY           bw, %7, %8, %16
    SBUTTERFLY           bw, %9, %10, %16
    SBUTTERFLY           bw, %11, %12, %16
    SBUTTERFLY           bw, %13, %14, %16
    mova              m%16, %17
    mova                %17, m%14
    SBUTTERFLY           bw, %15, %16, %14
    SBUTTERFLY           wd, %1, %3, %14
    SBUTTERFLY           wd, %2, %4, %14
    SBUTTERFLY           wd, %5, %7, %14
    SBUTTERFLY           wd, %6, %8, %14
    SBUTTERFLY           wd, %9, %11, %14
    SBUTTERFLY           wd, %10, %12, %14
    SBUTTERFLY           wd, %13, %15, %14
    mova              m%14, %17
    mova                %17, m%12
    SBUTTERFLY           wd, %14, %16, %12
    SBUTTERFLY           dq, %1, %5, %12
    SBUTTERFLY           dq, %2, %6, %12
    SBUTTERFLY           dq, %3, %7, %12
    SBUTTERFLY           dq, %4, %8, %12
    SBUTTERFLY           dq, %9, %13, %12
    SBUTTERFLY           dq, %10, %14, %12
    SBUTTERFLY           dq, %11, %15, %12
    mova              m%12, %17
    mova                %17, m%8
    SBUTTERFLY           dq, %12, %16, %8
    SBUTTERFLY          qdq, %1, %9, %8
    SBUTTERFLY          qdq, %2, %10, %8
    SBUTTERFLY          qdq, %3, %11, %8
    SBUTTERFLY          qdq, %4, %12, %8
    SBUTTERFLY          qdq, %5, %13, %8
    SBUTTERFLY          qdq, %6, %14, %8
    SBUTTERFLY          qdq, %7, %15, %8
    mova               m%8, %17
    mova                %17, m%1
    SBUTTERFLY          qdq, %8, %16, %1
    mova               m%1, %17
    SWAP                 %2, %9
    SWAP                 %3, %5
    SWAP                 %4, %13
    SWAP                 %6, %11
    SWAP                 %8, %15
    SWAP                %12, %14
%endmacro

%macro TRANSPOSE8x8B 13
    SBUTTERFLY           bw, %1, %2, %7
    movdq%10            m%7, %9
    movdqa              %11, m%2
    SBUTTERFLY           bw, %3, %4, %2
    SBUTTERFLY           bw, %5, %6, %2
    SBUTTERFLY           bw, %7, %8, %2
    SBUTTERFLY           wd, %1, %3, %2
    movdqa              m%2, %11
    movdqa              %11, m%3
    SBUTTERFLY           wd, %2, %4, %3
    SBUTTERFLY           wd, %5, %7, %3
    SBUTTERFLY           wd, %6, %8, %3
    SBUTTERFLY           dq, %1, %5, %3
    SBUTTERFLY           dq, %2, %6, %3
    movdqa              m%3, %11
    movh                %12, m%2
    movhps              %13, m%2
    SBUTTERFLY           dq, %3, %7, %2
    SBUTTERFLY           dq, %4, %8, %2
    SWAP                 %2, %5
    SWAP                 %4, %7
%endmacro

%macro DEFINE_REAL_P7_TO_Q7 0-1 0
%define P7 dstq  + 4*mstrideq + %1
%define P6 dstq  +   mstride3q + %1
%define P5 dstq  + 2*mstrideq + %1
%define P4 dstq  +   mstrideq + %1
%define P3 dstq               + %1
%define P2 dstq  +    strideq + %1
%define P1 dstq  + 2* strideq + %1
%define P0 dstq  +   stride3q + %1
%define Q0 dstq  + 4* strideq + %1
%define Q1 dst2q +  mstride3q + %1
%define Q2 dst2q + 2*mstrideq + %1
%define Q3 dst2q +   mstrideq + %1
%define Q4 dst2q              + %1
%define Q5 dst2q +    strideq + %1
%define Q6 dst2q + 2* strideq + %1
%define Q7 dst2q +   stride3q + %1
%endmacro

%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
%define P3 rsp +  0*mmsize + %1
%define P2 rsp +  1*mmsize + %1
%define P1 rsp +  2*mmsize + %1
%define P0 rsp +  3*mmsize + %1
%define Q0 rsp +  4*mmsize + %1
%define Q1 rsp +  5*mmsize + %1
%define Q2 rsp +  6*mmsize + %1
%define Q3 rsp +  7*mmsize + %1
%if mmsize == 16
%define P7 rsp +  8*mmsize + %1
%define P6 rsp +  9*mmsize + %1
%define P5 rsp + 10*mmsize + %1
%define P4 rsp + 11*mmsize + %1
%define Q4 rsp + 12*mmsize + %1
%define Q5 rsp + 13*mmsize + %1
%define Q6 rsp + 14*mmsize + %1
%define Q7 rsp + 15*mmsize + %1
%endif
%endmacro

; ..............AB -> AAAAAAAABBBBBBBB
%macro SPLATB_MIX 1-2 [mask_mix]
%if cpuflag(ssse3)
    pshufb               %1, %2
%else
    punpcklbw            %1, %1
    punpcklwd            %1, %1
    punpckldq            %1, %1
%endif
%endmacro
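; Roughly, in C, for a 16-byte register (the SSE2 path reaches the same
; layout with three interleaves of the low lanes):
;   for (i = 0; i < 16; i++)
;       out[i] = in[i >> 3];   // 8 copies of byte A, then 8 of byte B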
%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only
%assign %%ext 0
%if ARCH_X86_32 || mmsize == 8
%assign %%ext %5
%endif

%if UNIX64
cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
%else
%if WIN64
cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3
%else
cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3
%define Ed dword r2m
%define Id dword r3m
%endif
%define Hd dword r4m
%endif

    mov            mstrideq, strideq
    neg            mstrideq

    lea            stride3q, [strideq*3]
    lea           mstride3q, [mstrideq*3]

%ifidn %1, h
%if %2 != 16
%if mmsize == 16
%define movx movh
%else
%define movx mova
%endif
    lea                dstq, [dstq + 4*strideq - 4]
%else
%define movx movu
    lea                dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
%endif
%else
    lea                dstq, [dstq + 4*mstrideq]
%endif
    ; FIXME we shouldn't need two dst registers if mmsize == 8
    lea               dst2q, [dstq + 8*strideq]

    DEFINE_REAL_P7_TO_Q7

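    ; For the horizontal filter the pixels that cross the edge lie along each
    ; row, so the rows are loaded and transposed into the P7..Q7 layout below
    ; (and transposed back after filtering); the vertical filter can operate
    ; on the rows in place.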
%ifidn %1, h
    movx                 m0, [P7]
    movx                 m1, [P6]
    movx                 m2, [P5]
    movx                 m3, [P4]
    movx                 m4, [P3]
    movx                 m5, [P2]
%if (ARCH_X86_64 && mmsize == 16) || %2 > 16
    movx                 m6, [P1]
%endif
    movx                 m7, [P0]
%ifdef m8
    movx                 m8, [Q0]
    movx                 m9, [Q1]
    movx                m10, [Q2]
    movx                m11, [Q3]
    movx                m12, [Q4]
    movx                m13, [Q5]
    movx                m14, [Q6]
    movx                m15, [Q7]
    DEFINE_TRANSPOSED_P7_TO_Q7
%if %2 == 16
    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
    mova               [P7], m0
    mova               [P6], m1
    mova               [P5], m2
    mova               [P4], m3
%else ; %2 == 44/48/84/88
    ; 8x16 transpose
    punpcklbw            m0, m1
    punpcklbw            m2, m3
    punpcklbw            m4, m5
    punpcklbw            m6, m7
    punpcklbw            m8, m9
    punpcklbw           m10, m11
    punpcklbw           m12, m13
    punpcklbw           m14, m15
    TRANSPOSE8x8W         0, 2, 4, 6, 8, 10, 12, 14, 15
    SWAP                  0, 4
    SWAP                  2, 5
    SWAP                  0, 6
    SWAP                  0, 7
    SWAP                 10, 9
    SWAP                 12, 10
    SWAP                 14, 11
%endif ; %2
    mova               [P3], m4
    mova               [P2], m5
    mova               [P1], m6
    mova               [P0], m7
    mova               [Q0], m8
    mova               [Q1], m9
    mova               [Q2], m10
    mova               [Q3], m11
%if %2 == 16
    mova               [Q4], m12
    mova               [Q5], m13
    mova               [Q6], m14
    mova               [Q7], m15
%endif ; %2
%else ; x86-32
%if %2 == 16
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80]
    DEFINE_TRANSPOSED_P7_TO_Q7
    movh               [P7], m0
    movh               [P5], m1
    movh               [P3], m2
    movh               [P1], m3
    movh               [Q2], m5
    movh               [Q4], m6
    movh               [Q6], m7
    movhps             [P6], m0
    movhps             [P4], m1
    movhps             [P2], m2
    movhps             [P0], m3
    movhps             [Q3], m5
    movhps             [Q5], m6
    movhps             [Q7], m7
    DEFINE_REAL_P7_TO_Q7
    movx                 m0, [Q0]
    movx                 m1, [Q1]
    movx                 m2, [Q2]
    movx                 m3, [Q3]
    movx                 m4, [Q4]
    movx                 m5, [Q5]
    movx                 m7, [Q7]
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88]
    DEFINE_TRANSPOSED_P7_TO_Q7 8
    movh               [P7], m0
    movh               [P5], m1
    movh               [P3], m2
    movh               [P1], m3
    movh               [Q2], m5
    movh               [Q4], m6
    movh               [Q6], m7
    movhps             [P6], m0
    movhps             [P4], m1
    movhps             [P2], m2
    movhps             [P0], m3
    movhps             [Q3], m5
    movhps             [Q5], m6
    movhps             [Q7], m7
    DEFINE_TRANSPOSED_P7_TO_Q7
%elif %2 > 16 ; %2 == 44/48/84/88
    punpcklbw            m0, m1
    punpcklbw            m2, m3
    punpcklbw            m4, m5
    punpcklbw            m6, m7
    movx                 m1, [Q0]
    movx                 m3, [Q1]
    movx                 m5, [Q2]
    movx                 m7, [Q3]
    punpcklbw            m1, m3
    punpcklbw            m5, m7
    movx                 m3, [Q4]
    movx                 m7, [Q5]
    punpcklbw            m3, m7
    mova              [rsp], m3
    movx                 m3, [Q6]
    movx                 m7, [Q7]
    punpcklbw            m3, m7
    DEFINE_TRANSPOSED_P7_TO_Q7
    TRANSPOSE8x8W         0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
    mova               [P3], m0
    mova               [P2], m2
    mova               [P1], m4
    mova               [P0], m6
    mova               [Q1], m5
    mova               [Q2], m7
    mova               [Q3], m3
%else ; %2 == 4 || %2 == 8
    SBUTTERFLY           bw, 0, 1, 6
    SBUTTERFLY           bw, 2, 3, 6
    SBUTTERFLY           bw, 4, 5, 6
    mova    [rsp+4*mmsize], m5
    mova                 m6, [P1]
    SBUTTERFLY           bw, 6, 7, 5
    DEFINE_TRANSPOSED_P7_TO_Q7
    TRANSPOSE4x4W         0, 2, 4, 6, 5
    mova               [P3], m0
    mova               [P2], m2
    mova               [P1], m4
    mova               [P0], m6
    mova                 m5, [rsp+4*mmsize]
    TRANSPOSE4x4W         1, 3, 5, 7, 0
    mova               [Q0], m1
    mova               [Q1], m3
    mova               [Q2], m5
    mova               [Q3], m7
%endif ; %2
%endif ; x86-32/64
%endif ; %1 == h

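    ; The fm mask below is VP9's per-column edge-strength test; roughly,
    ; in C, for each of the 8/16 columns:
    ;   fm = abs(p3-p2) <= I && abs(p2-p1) <= I && abs(p1-p0) <= I &&
    ;        abs(q1-q0) <= I && abs(q2-q1) <= I && abs(q3-q2) <= I &&
    ;        abs(p0-q0) * 2 + abs(p1-q1) / 2 <= E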
    ; calc fm mask (the conditions are evaluated inverted: the "exceeds"
    ; results are accumulated and the final mask is flipped via pb_ff)
%if %2 == 16 || mmsize == 8
%if cpuflag(ssse3)
    pxor                 m0, m0
%endif
    SPLATB_REG           m2, I, m0          ; I I I I ...
    SPLATB_REG           m3, E, m0          ; E E E E ...
%else
%if cpuflag(ssse3)
    mova                 m0, [mask_mix]
%endif
    movd                 m2, Id
    movd                 m3, Ed
    SPLATB_MIX           m2, m0
    SPLATB_MIX           m3, m0
%endif
    mova                 m0, [pb_80]
    pxor                 m2, m0
    pxor                 m3, m0
%ifdef m8
%ifidn %1, v
    mova                 m8, [P3]
    mova                 m9, [P2]
    mova                m10, [P1]
    mova                m11, [P0]
    mova                m12, [Q0]
    mova                m13, [Q1]
    mova                m14, [Q2]
    mova                m15, [Q3]
%else
    ; In the horizontal case, P3..Q3 are already present in some registers
    ; due to the previous transpose, so we just swap registers.
    SWAP                  8, 4, 12
    SWAP                  9, 5, 13
    SWAP                 10, 6, 14
    SWAP                 11, 7, 15
%endif
%define rp3 m8
%define rp2 m9
%define rp1 m10
%define rp0 m11
%define rq0 m12
%define rq1 m13
%define rq2 m14
%define rq3 m15
%else
%define rp3 [P3]
%define rp2 [P2]
%define rp1 [P1]
%define rp0 [P0]
%define rq0 [Q0]
%define rq1 [Q1]
%define rq2 [Q2]
%define rq3 [Q3]
%endif
    ABSSUB_GT            m5, rp3, rp2, m2, m7, m0 ; m5 = abs(p3-p2) <= I
    ABSSUB_GT            m1, rp2, rp1, m2, m7, m0 ; m1 = abs(p2-p1) <= I
    por                  m5, m1
    ABSSUB_GT            m1, rp1, rp0, m2, m7, m0 ; m1 = abs(p1-p0) <= I
    por                  m5, m1
    ABSSUB_GT            m1, rq0, rq1, m2, m7, m0 ; m1 = abs(q1-q0) <= I
    por                  m5, m1
    ABSSUB_GT            m1, rq1, rq2, m2, m7, m0 ; m1 = abs(q2-q1) <= I
    por                  m5, m1
    ABSSUB_GT            m1, rq2, rq3, m2, m7, m0 ; m1 = abs(q3-q2) <= I
    por                  m5, m1
    ABSSUB               m1, rp0, rq0, m7   ; abs(p0-q0)
    paddusb              m1, m1             ; abs(p0-q0) * 2
    ABSSUB               m2, rp1, rq1, m7   ; abs(p1-q1)
    pand                 m2, [pb_fe]        ; drop lsb so shift can work
    psrlq                m2, 1              ; abs(p1-q1)/2
    paddusb              m1, m2             ; abs(p0-q0)*2 + abs(p1-q1)/2
    pxor                 m1, m0
    pcmpgtb              m1, m3
    por                  m1, m5             ; fm final value
    SWAP                  1, 3
    pxor                 m3, [pb_ff]

    ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
    ; calc flat8in (if not 44_16) and hev masks
%if %2 != 44 && %2 != 4
    mova                 m6, [pb_81]        ; [1 1 1 1 ...] ^ 0x80
    ABSSUB_GT            m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1
%ifdef m8
    mova                 m8, [pb_80]
%define rb80 m8
%else
%define rb80 [pb_80]
%endif
    ABSSUB_GT            m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1
    por                  m2, m1
    ABSSUB               m4, rp1, rp0, m5   ; abs(p1 - p0)
%if %2 <= 16
%if cpuflag(ssse3)
    pxor                 m0, m0
%endif
    SPLATB_REG           m7, H, m0          ; H H H H ...
%else
    movd                 m7, Hd
    SPLATB_MIX           m7
%endif
    pxor                 m7, rb80
    pxor                 m4, rb80
    pcmpgtb              m0, m4, m7         ; abs(p1 - p0) > H (1/2 hev condition)
    CMP_GT               m4, m6             ; abs(p1 - p0) <= 1
    por                  m2, m4             ; (flat8in)
    ABSSUB               m4, rq1, rq0, m1   ; abs(q1 - q0)
    pxor                 m4, rb80
    pcmpgtb              m5, m4, m7         ; abs(q1 - q0) > H (2/2 hev condition)
    por                  m0, m5             ; hev final value
    CMP_GT               m4, m6             ; abs(q1 - q0) <= 1
    por                  m2, m4             ; (flat8in)
    ABSSUB_GT            m1, rq2, rq0, m6, m5, rb80 ; abs(q2 - q0) <= 1
    por                  m2, m1
    ABSSUB_GT            m1, rq3, rq0, m6, m5, rb80 ; abs(q3 - q0) <= 1
    por                  m2, m1             ; flat8in final value
    pxor                 m2, [pb_ff]
%if %2 == 84 || %2 == 48
    pand                 m2, [mask_mix%2]
%endif
%else
    mova                 m6, [pb_80]
%if %2 == 44
    movd                 m7, Hd
    SPLATB_MIX           m7
%else
%if cpuflag(ssse3)
    pxor                 m0, m0
%endif
    SPLATB_REG           m7, H, m0          ; H H H H ...
%endif
    pxor                 m7, m6
    ABSSUB               m4, rp1, rp0, m1   ; abs(p1 - p0)
    pxor                 m4, m6
    pcmpgtb              m0, m4, m7         ; abs(p1 - p0) > H (1/2 hev condition)
    ABSSUB               m4, rq1, rq0, m1   ; abs(q1 - q0)
    pxor                 m4, m6
    pcmpgtb              m5, m4, m7         ; abs(q1 - q0) > H (2/2 hev condition)
    por                  m0, m5             ; hev final value
%endif

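    ; flat8out extends the flat8in flatness test to the outer taps; roughly,
    ; in C:
    ;   flat8out = abs(p7-p0) <= 1 && abs(p6-p0) <= 1 && abs(p5-p0) <= 1 &&
    ;              abs(p4-p0) <= 1 && abs(q4-q0) <= 1 && abs(q5-q0) <= 1 &&
    ;              abs(q6-q0) <= 1 && abs(q7-q0) <= 1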
%if %2 == 16
    ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
    ; calc flat8out mask
%ifdef m8
    mova                 m8, [P7]
    mova                 m9, [P6]
%define rp7 m8
%define rp6 m9
%else
%define rp7 [P7]
%define rp6 [P6]
%endif
    ABSSUB_GT            m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1
    ABSSUB_GT            m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1
    por                  m1, m7
%ifdef m8
    mova                 m8, [P5]
    mova                 m9, [P4]
%define rp5 m8
%define rp4 m9
%else
%define rp5 [P5]
%define rp4 [P4]
%endif
    ABSSUB_GT            m7, rp5, rp0, m6, m5 ; abs(p5 - p0) <= 1
    por                  m1, m7
    ABSSUB_GT            m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1
    por                  m1, m7
%ifdef m8
    mova                m14, [Q4]
    mova                m15, [Q5]
%define rq4 m14
%define rq5 m15
%else
%define rq4 [Q4]
%define rq5 [Q5]
%endif
    ABSSUB_GT            m7, rq4, rq0, m6, m5 ; abs(q4 - q0) <= 1
    por                  m1, m7
    ABSSUB_GT            m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1
    por                  m1, m7
%ifdef m8
    mova                m14, [Q6]
    mova                m15, [Q7]
%define rq6 m14
%define rq7 m15
%else
%define rq6 [Q6]
%define rq7 [Q7]
%endif
    ABSSUB_GT            m7, rq6, rq0, m6, m5 ; abs(q6 - q0) <= 1
    por                  m1, m7
    ABSSUB_GT            m7, rq7, rq0, m6, m5 ; abs(q7 - q0) <= 1
    por                  m1, m7              ; flat8out final value
    pxor                 m1, [pb_ff]
%endif

    ; if (fm) {
    ;     if (out && in) filter_14()
    ;     else if (in)   filter_6()
    ;     else if (hev)  filter_2()
    ;     else           filter_4()
    ; }
    ;
    ; f14:                fm &  out &  in
    ; f6:  fm & ~f14 & in        => fm & ~(out & in) & in        => fm & ~out & in
    ; f2:  fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev
    ; f4:  fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev

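    ; filter2()/filter4() below apply VP9's narrow filter; roughly, in C:
    ;   f  = hev ? clip_i8(clip_i8(p1 - q1) + 3 * (q0 - p0))  // filter2
    ;            : clip_i8(3 * (q0 - p0));                    // filter4
    ;   f1 = clip_i8(f + 4) >> 3;   q0' = clip_u8(q0 - f1);
    ;   f2 = clip_i8(f + 3) >> 3;   p0' = clip_u8(p0 + f2);
    ;   if (!hev) { f = (f1 + 1) >> 1;
    ;               p1' = clip_u8(p1 + f);  q1' = clip_u8(q1 - f); }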
    ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
    ; filter2()
%if %2 != 44 && %2 != 4
    mova                 m6, [pb_80]        ; already in m6 if 44_16
    SCRATCH               2, 15, rsp+%3+%4
%if %2 == 16
    SCRATCH               1,  8, rsp+%3+%4+16
%endif
%endif
    pxor                 m2, m6, rq0        ; q0 ^ 0x80
    pxor                 m4, m6, rp0        ; p0 ^ 0x80
    psubsb               m2, m4             ; (signed) q0 - p0
    pxor                 m4, m6, rp1        ; p1 ^ 0x80
    pxor                 m5, m6, rq1        ; q1 ^ 0x80
    psubsb               m4, m5             ; (signed) p1 - q1
    paddsb               m4, m2             ;   (q0 - p0) + (p1 - q1)
    paddsb               m4, m2             ; 2*(q0 - p0) + (p1 - q1)
    paddsb               m4, m2             ; 3*(q0 - p0) + (p1 - q1)
    paddsb               m6, m4, [pb_4]     ; m6: f1 = clip(f + 4, 127)
    paddsb               m4, [pb_3]         ; m4: f2 = clip(f + 3, 127)
%ifdef m8
    mova                m14, [pb_10]        ; will be reused in filter4()
%define rb10 m14
%else
%define rb10 [pb_10]
%endif
    SRSHIFT3B_2X         m6, m4, rb10, m7   ; f1 and f2 sign byte shift by 3
    SIGN_SUB             m7, rq0, m6, m5    ; m7 = q0 - f1
    SIGN_ADD             m1, rp0, m4, m5    ; m1 = p0 + f2
%if %2 != 44 && %2 != 4
%ifdef m8
    pandn                m6, m15, m3        ; ~mask(in) & mask(fm)
%else
    mova                 m6, [rsp+%3+%4]
    pandn                m6, m3
%endif
    pand                 m6, m0             ; (~mask(in) & mask(fm)) & mask(hev)
%else
    pand                 m6, m3, m0
%endif
    MASK_APPLY           m7, rq0, m6, m5    ; m7 = filter2(q0) & mask / we write it in filter4()
    MASK_APPLY           m1, rp0, m6, m5    ; m1 = filter2(p0) & mask / we write it in filter4()

    ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in])
    ; filter4()
    mova                 m4, m2
    paddsb               m2, m4             ; 2 * (q0 - p0)
    paddsb               m2, m4             ; 3 * (q0 - p0)
    paddsb               m6, m2, [pb_4]     ; m6: f1 = clip(f + 4, 127)
    paddsb               m2, [pb_3]         ; m2: f2 = clip(f + 3, 127)
    SRSHIFT3B_2X         m6, m2, rb10, m4   ; f1 and f2 sign byte shift by 3
%if %2 != 44 && %2 != 4
%ifdef m8
    pandn                m5, m15, m3        ; ~mask(in) & mask(fm)
%else
    mova                 m5, [rsp+%3+%4]
    pandn                m5, m3
%endif
    pandn                m0, m5             ; ~mask(hev) & (~mask(in) & mask(fm))
%else
    pandn                m0, m3
%endif
    SIGN_SUB             m5, rq0, m6, m4    ; q0 - f1
    MASK_APPLY           m5, m7, m0, m4     ; filter4(q0) & mask
    mova               [Q0], m5
    SIGN_ADD             m7, rp0, m2, m4    ; p0 + f2
    MASK_APPLY           m7, m1, m0, m4     ; filter4(p0) & mask
    mova               [P0], m7
    paddb                m6, [pb_80]        ;
    pxor                 m1, m1             ;   f=(f1+1)>>1
    pavgb                m6, m1             ;
    psubb                m6, [pb_40]        ;
    SIGN_ADD             m1, rp1, m6, m2    ; p1 + f
    SIGN_SUB             m4, rq1, m6, m2    ; q1 - f
    MASK_APPLY           m1, rp1, m0, m2    ; m1 = filter4(p1)
    MASK_APPLY           m4, rq1, m0, m2    ; m4 = filter4(q1)
    mova               [P1], m1
    mova               [Q1], m4

%if %2 != 44 && %2 != 4
    UNSCRATCH             2, 15, rsp+%3+%4
%endif

    ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
    ; filter6()
%if %2 != 44 && %2 != 4
    pxor                 m0, m0
%if %2 != 16
    pand                 m3, m2
%else
    pand                 m2, m3             ; mask(fm) & mask(in)
%ifdef m8
    pandn                m3, m8, m2         ; ~mask(out) & (mask(fm) & mask(in))
%else
    mova                 m3, [rsp+%3+%4+16]
    pandn                m3, m2
%endif
%endif
%ifdef m8
    mova                m14, [P3]
    mova                 m9, [Q3]
%define rp3 m14
%define rq3 m9
%else
%define rp3 [P3]
%define rq3 [Q3]
%endif
    mova                 m1, [P2]
    FILTER_INIT          m4, m5, m6, m7, [P2], %4, 6, m3, m1              ; [p2]
    mova                 m1, [Q2]
    FILTER_UPDATE        m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3, "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1
    FILTER_UPDATE        m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3, "", m1         ; [p0] -p3 -p1 +p0 +q2
    FILTER_UPDATE        m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3, "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3
    FILTER_UPDATE        m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3, ""             ; [q1] -p2 -q0 +q1 +q3
    FILTER_UPDATE        m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3, m1             ; [q2] -p1 -q1 +q2 +q3
%endif

%if %2 == 16
    UNSCRATCH             1, 8, rsp+%3+%4+16
%endif

    ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
    ; filter14()
    ;
    ; m2  m3  m8  m9 m14 m15 m10 m11 m12 m13
    ;
    ;                     q2  q3  p3  p2  p1  p0  q0  q1
    ; p6  -7   p7  p6  p5  p4   .   .   .   .   .
    ; p5  -6  -p7 -p6 +p5 +q1   .   .   .   .
    ; p4  -5  -p7 -p5 +p4 +q2   .   .   .  q2
    ; p3  -4  -p7 -p4 +p3 +q3   .   .   .  q3
    ; p2  -3  -p7 -p3 +p2 +q4   .   .   .  q4
    ; p1  -2  -p7 -p2 +p1 +q5   .   .   .  q5
    ; p0  -1  -p7 -p1 +p0 +q6   .   .   .  q6
    ; q0  +0  -p7 -p0 +q0 +q7   .   .   .  q7
    ; q1  +1  -p6 -q0 +q1 +q7  q1   .   .   .
    ; q2  +2  -p5 -q1 +q2 +q7   .  q2   .   .
    ; q3  +3  -p4 -q2 +q3 +q7   .  q3   .   .
    ; q4  +4  -p3 -q3 +q4 +q7   .  q4   .   .
    ; q5  +5  -p2 -q4 +q5 +q7   .  q5   .   .
    ; q6  +6  -p1 -q5 +q6 +q7   .  q6   .   .

%if %2 == 16
    pand                 m1, m2             ; mask(out) & (mask(fm) & mask(in))
    mova                 m2, [P7]
    mova                 m3, [P6]
%ifdef m8
    mova                 m8, [P5]
    mova                 m9, [P4]
%define rp5 m8
%define rp4 m9
%define rp5s m8
%define rp4s m9
%define rp3s m14
%define rq4 m8
%define rq5 m9
%define rq6 m14
%define rq7 m15
%define rq4s m8
%define rq5s m9
%define rq6s m14
%else
%define rp5 [P5]
%define rp4 [P4]
%define rp5s ""
%define rp4s ""
%define rp3s ""
%define rq4 [Q4]
%define rq5 [Q5]
%define rq6 [Q6]
%define rq7 [Q7]
%define rq4s ""
%define rq5s ""
%define rq6s ""
%endif
    FILTER_INIT          m4, m5, m6, m7, [P6], %4, 14, m1,  m3                 ; [p6]
    FILTER_UPDATE        m4, m5, m6, m7, [P5], %4,  8,  9, 10,  5, 4, m1, rp5s ; [p5] -p7 -p6 +p5 +q1
    FILTER_UPDATE        m4, m5, m6, m7, [P4], %4,  8, 10, 11,  6, 4, m1, rp4s ; [p4] -p7 -p5 +p4 +q2
    FILTER_UPDATE        m4, m5, m6, m7, [P3], %4,  8, 11,  0,  7, 4, m1, rp3s ; [p3] -p7 -p4 +p3 +q3
    FILTER_UPDATE        m4, m5, m6, m7, [P2], %4,  8,  0,  1, 12, 4, m1, "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4
    FILTER_UPDATE        m4, m5, m6, m7, [P1], %4,  8,  1,  2, 13, 4, m1, "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5
    FILTER_UPDATE        m4, m5, m6, m7, [P0], %4,  8,  2,  3, 14, 4, m1, "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6
    FILTER_UPDATE        m4, m5, m6, m7, [Q0], %4,  8,  3,  4, 15, 4, m1, "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7
    FILTER_UPDATE        m4, m5, m6, m7, [Q1], %4,  9,  4,  5, 15, 4, m1, ""   ; [q1] -p6 -q0 +q1 +q7
    FILTER_UPDATE        m4, m5, m6, m7, [Q2], %4, 10,  5,  6, 15, 4, m1, ""   ; [q2] -p5 -q1 +q2 +q7
    FILTER_UPDATE        m4, m5, m6, m7, [Q3], %4, 11,  6,  7, 15, 4, m1, ""   ; [q3] -p4 -q2 +q3 +q7
    FILTER_UPDATE        m4, m5, m6, m7, [Q4], %4,  0,  7, 12, 15, 4, m1, rq4s ; [q4] -p3 -q3 +q4 +q7
    FILTER_UPDATE        m4, m5, m6, m7, [Q5], %4,  1, 12, 13, 15, 4, m1, rq5s ; [q5] -p2 -q4 +q5 +q7
    FILTER_UPDATE        m4, m5, m6, m7, [Q6], %4,  2, 13, 14, 15, 4, m1, rq6s ; [q6] -p1 -q5 +q6 +q7
%endif

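    ; For the horizontal variants, the filtered P/Q planes now live in the
    ; transposed on-stack layout (or in registers), so they are transposed
    ; back into the destination rows below; the smaller sizes only write
    ; back the central columns their filters can actually modify.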
%ifidn %1, h
%if %2 == 16
    mova                 m0, [P7]
    mova                 m1, [P6]
    mova                 m2, [P5]
    mova                 m3, [P4]
    mova                 m4, [P3]
    mova                 m5, [P2]
%if ARCH_X86_64
    mova                 m6, [P1]
%endif
    mova                 m7, [P0]
%if ARCH_X86_64
    mova                 m8, [Q0]
    mova                 m9, [Q1]
    mova                m10, [Q2]
    mova                m11, [Q3]
    mova                m12, [Q4]
    mova                m13, [Q5]
    mova                m14, [Q6]
    mova                m15, [Q7]
    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
    DEFINE_REAL_P7_TO_Q7
    movu               [P7], m0
    movu               [P6], m1
    movu               [P5], m2
    movu               [P4], m3
    movu               [P3], m4
    movu               [P2], m5
    movu               [P1], m6
    movu               [P0], m7
    movu               [Q0], m8
    movu               [Q1], m9
    movu               [Q2], m10
    movu               [Q3], m11
    movu               [Q4], m12
    movu               [Q5], m13
    movu               [Q6], m14
    movu               [Q7], m15
%else
    DEFINE_REAL_P7_TO_Q7
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1]
    movh               [P7], m0
    movh               [P5], m1
    movh               [P3], m2
    movh               [P1], m3
    movh               [Q2], m5
    movh               [Q4], m6
    movh               [Q6], m7
    movhps             [P6], m0
    movhps             [P4], m1
    movhps             [P2], m2
    movhps             [P0], m3
    movhps             [Q3], m5
    movhps             [Q5], m6
    movhps             [Q7], m7
    DEFINE_TRANSPOSED_P7_TO_Q7
    mova                 m0, [Q0]
    mova                 m1, [Q1]
    mova                 m2, [Q2]
    mova                 m3, [Q3]
    mova                 m4, [Q4]
    mova                 m5, [Q5]
    mova                 m7, [Q7]
    DEFINE_REAL_P7_TO_Q7 8
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1]
    movh               [P7], m0
    movh               [P5], m1
    movh               [P3], m2
    movh               [P1], m3
    movh               [Q2], m5
    movh               [Q4], m6
    movh               [Q6], m7
    movhps             [P6], m0
    movhps             [P4], m1
    movhps             [P2], m2
    movhps             [P0], m3
    movhps             [Q3], m5
    movhps             [Q5], m6
    movhps             [Q7], m7
%endif
%elif %2 == 44 || %2 == 4
    SWAP                  0, 1   ; m0 = p1
    SWAP                  1, 7   ; m1 = p0
    SWAP                  2, 5   ; m2 = q0
    SWAP                  3, 4   ; m3 = q1
    DEFINE_REAL_P7_TO_Q7 2
    SBUTTERFLY           bw, 0, 1, 4
    SBUTTERFLY           bw, 2, 3, 4
    SBUTTERFLY           wd, 0, 2, 4
    SBUTTERFLY           wd, 1, 3, 4
%if mmsize == 16
    movd               [P7], m0
    movd               [P3], m2
    movd               [Q0], m1
    movd               [Q4], m3
    psrldq               m0, 4
    psrldq               m1, 4
    psrldq               m2, 4
    psrldq               m3, 4
    movd               [P6], m0
    movd               [P2], m2
    movd               [Q1], m1
    movd               [Q5], m3
    psrldq               m0, 4
    psrldq               m1, 4
    psrldq               m2, 4
    psrldq               m3, 4
    movd               [P5], m0
    movd               [P1], m2
    movd               [Q2], m1
    movd               [Q6], m3
    psrldq               m0, 4
    psrldq               m1, 4
    psrldq               m2, 4
    psrldq               m3, 4
    movd               [P4], m0
    movd               [P0], m2
    movd               [Q3], m1
    movd               [Q7], m3
%else
    movd               [P7], m0
    movd               [P5], m2
    movd               [P3], m1
    movd               [P1], m3
    psrlq                m0, 32
    psrlq                m2, 32
    psrlq                m1, 32
    psrlq                m3, 32
    movd               [P6], m0
    movd               [P4], m2
    movd               [P2], m1
    movd               [P0], m3
%endif
%else
    ; The following code does a transpose of 8 full lines to 16 half
    ; lines (high part). It is inlined to avoid the need for a staging area.
    mova                 m0, [P3]
    mova                 m1, [P2]
    mova                 m2, [P1]
    mova                 m3, [P0]
    mova                 m4, [Q0]
    mova                 m5, [Q1]
%ifdef m8
    mova                 m6, [Q2]
%endif
    mova                 m7, [Q3]
    DEFINE_REAL_P7_TO_Q7
%ifdef m8
    SBUTTERFLY           bw, 0, 1, 8
    SBUTTERFLY           bw, 2, 3, 8
    SBUTTERFLY           bw, 4, 5, 8
    SBUTTERFLY           bw, 6, 7, 8
    SBUTTERFLY           wd, 0, 2, 8
    SBUTTERFLY           wd, 1, 3, 8
    SBUTTERFLY           wd, 4, 6, 8
    SBUTTERFLY           wd, 5, 7, 8
    SBUTTERFLY           dq, 0, 4, 8
    SBUTTERFLY           dq, 1, 5, 8
    SBUTTERFLY           dq, 2, 6, 8
    SBUTTERFLY           dq, 3, 7, 8
%else
    SBUTTERFLY           bw, 0, 1, 6
    mova    [rsp+mmsize*4], m1
    mova                 m6, [rsp+mmsize*6]
    SBUTTERFLY           bw, 2, 3, 1
    SBUTTERFLY           bw, 4, 5, 1
    SBUTTERFLY           bw, 6, 7, 1
    SBUTTERFLY           wd, 0, 2, 1
    mova    [rsp+mmsize*6], m2
    mova                 m1, [rsp+mmsize*4]
    SBUTTERFLY           wd, 1, 3, 2
    SBUTTERFLY           wd, 4, 6, 2
    SBUTTERFLY           wd, 5, 7, 2
    SBUTTERFLY           dq, 0, 4, 2
    SBUTTERFLY           dq, 1, 5, 2
%if mmsize == 16
    movh               [Q0], m1
    movhps             [Q1], m1
%else
    mova               [P3], m1
%endif
    mova                 m2, [rsp+mmsize*6]
    SBUTTERFLY           dq, 2, 6, 1
    SBUTTERFLY           dq, 3, 7, 1
%endif
    SWAP                  3, 6
    SWAP                  1, 4
%if mmsize == 16
    movh               [P7], m0
    movhps             [P6], m0
    movh               [P5], m1
    movhps             [P4], m1
    movh               [P3], m2
    movhps             [P2], m2
    movh               [P1], m3
    movhps             [P0], m3
%ifdef m8
    movh               [Q0], m4
    movhps             [Q1], m4
%endif
    movh               [Q2], m5
    movhps             [Q3], m5
    movh               [Q4], m6
    movhps             [Q5], m6
    movh               [Q6], m7
    movhps             [Q7], m7
%else
    mova               [P7], m0
    mova               [P6], m1
    mova               [P5], m2
    mova               [P4], m3
    mova               [P2], m5
    mova               [P1], m6
    mova               [P0], m7
%endif
%endif
%endif

    RET
%endmacro

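; Each LPF_16_VH expansion below instantiates LOOPFILTER twice, once per
; direction; only the horizontal variant needs the extra %3 bytes of stack
; for the transpose staging area, which is why the vertical one passes 0.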
%macro LPF_16_VH 5
INIT_XMM %5
LOOPFILTER v, %1, %2, 0,  %4
LOOPFILTER h, %1, %2, %3, %4
%endmacro

%macro LPF_16_VH_ALL_OPTS 4
LPF_16_VH %1, %2, %3, %4, sse2
LPF_16_VH %1, %2, %3, %4, ssse3
LPF_16_VH %1, %2, %3, %4, avx
%endmacro

LPF_16_VH_ALL_OPTS 16, 512, 256, 32
LPF_16_VH_ALL_OPTS 44,   0, 128,  0
LPF_16_VH_ALL_OPTS 48, 256, 128, 16
LPF_16_VH_ALL_OPTS 84, 256, 128, 16
LPF_16_VH_ALL_OPTS 88, 256, 128, 16

INIT_MMX mmxext
LOOPFILTER v, 4, 0,   0, 0
LOOPFILTER h, 4, 0,  64, 0
LOOPFILTER v, 8, 128,  0, 8
LOOPFILTER h, 8, 128, 64, 8
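; Hedged note: with x86inc's usual name mangling, these expand to symbols of
; the form ff_vp9_loop_filter_{v,h}_{16,44,48,84,88,4,8}_{16,8}_<opt>; the
; assumed matching C prototype (the real declarations live in the VP9 dsp
; init code) is:
;   void ff_vp9_loop_filter_v_16_16_sse2(uint8_t *dst, ptrdiff_t stride,
;                                        int E, int I, int H);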