;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
; Lane-select masks for andps.  The first dword listed is lane 0 (lowest
; address); ~0 keeps a lane, 0 clears it.
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0, 0, ~0
ps_mask3: dd 0, 0, 0, ~0
ps_mask4: dd 0, ~0, 0, 0

; Per-lane multipliers used while populating tmp[] in imdct36_float.
ps_val1:  dd -0.5, -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd 1.0, 1.0, 0.8660254038, 0.8660254038
ps_val3:  dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd 0.5, 0.5, -0.6427876097, -0.6427876097
ps_val7:  dd 1.0, 1.0, -0.6427876097, -0.6427876097

; Sign-bit masks for xorps: a lane holding 0x80000000 is negated.
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000   ; negates lanes 2 and 3
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000   ; negates lanes 1 and 3

; Butterfly multipliers, one 16-byte row per BUTTERF/BUTTERF2 call
; (selected by the byte offset passed as the macro's third argument).
ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
         dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
         dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
         dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
         dd 1.0, 0.70710678118654752439, 0.0, 0.0

; Same rows as ps_cosh with the odd lanes (1 and 3) sign-flipped; used by the
; addsubps code path.
ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
              dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
              dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
              dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
              dd 1.0, -0.70710678118654752439, 0.0, 0.0

; Broadcast constants for four_imdct36_float; row k lives at costabs + 16*k.
costabs: times 4 dd 0.98480773      ; row 0
         times 4 dd 0.93969262      ; row 1
         times 4 dd 0.86602539      ; row 2
         times 4 dd -0.76604444     ; row 3
         times 4 dd -0.64278764     ; row 4
         times 4 dd 0.50000000      ; row 5
         times 4 dd -0.50000000     ; row 6
         times 4 dd -0.34202015     ; row 7
         times 4 dd -0.17364818     ; row 8
         times 4 dd 0.50190992      ; row 9
         times 4 dd 0.51763808      ; row 10
         times 4 dd 0.55168896      ; row 11
         times 4 dd 0.61038726      ; row 12
         times 4 dd 0.70710677      ; row 13
         times 4 dd 0.87172341      ; row 14
         times 4 dd 1.18310082      ; row 15
         times 4 dd 1.93185163      ; row 16
         times 4 dd 5.73685646      ; row 17

%define SBLIMIT 32
SECTION_TEXT

; PSHUFD dst, src, imm
; Permutes the four dwords of src into dst.  Uses pshufd on SSE2 (non-AVX)
; builds and shufps with src given as both inputs everywhere else.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd  %1, %2, %3
%else
    shufps  %1, %2, %2, %3
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps  %1, %2, %3, 0x4e
%else
    movlhps %1, %3                      ; high half of %1 = {y1,y2}
    movhlps %1, %2                      ; low half of %1  = {x3,x4}
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%endif
%endmacro

; INVERTHL dst, src
; dst = src with its low and high 64-bit halves exchanged.
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF val, scratch, table_offset
; Two-step butterfly on val (result left in val, scratch clobbered):
;   1. val = (val with lanes 2,3 negated) + (val with halves exchanged)
;   2. val *= table row; then each lane is combined with its pair neighbour
;      (shuffle 0xb1 swaps lanes 0<->1 and 2<->3).  The SSE3 path uses
;      addsubps with ps_cosh_sse3; the fallback negates lanes 1,3 and adds.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; BUTTERF2 val, scratch, table_offset
; Second step of BUTTERF only, with shuffle 0xe1 so that just lanes 0 and 1
; are exchanged (lanes 2 and 3 stay in place).
%macro BUTTERF2 3
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE src, scratch, base, stride
; Scatters the four floats of src to base, base+stride, base+2*stride and
; base+3*stride (lane k goes to base + k*stride).  Clobbers scratch and leaves
; src with its lanes swapped pairwise.
%macro STORE 4
    movhlps %2, %1
    movss   [%3       ], %1
    movss   [%3 + 2*%4], %2
    shufps  %1, %1, 0xb1
    movss   [%3 +   %4], %1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2
%endmacro

; LOAD dst, scratch, base, stride
; Gathers the first float found at base, base+stride, base+2*stride and
; base+3*stride into lanes 0..3 of dst.  Eight bytes are read at each of the
; four addresses; the second float of every pair is discarded by the shuffle.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

; LOADA64 dst, addr
; Loads 16 bytes from addr: one unaligned load on AVX, two 8-byte loads
; otherwise.  NOTE(review): the name suggests addr is only guaranteed to be
; 8-byte aligned - confirm against the callers.
%macro LOADA64 2
%if cpuflag(avx)
    movu    %1, [%2]
%else
    movlps  %1, [%2]
    movhps  %1, [%2 + 8]
%endif
%endmacro

; imdct36_float(out, buf, in, win)
;   in  : 18 input floats, read from inq .. inq+71.
;   win : window coefficients, read from winq .. winq+4*36+15.
;   buf : 18 values at a 16-byte stride (bufq .. bufq+4*68); each is added to
;         one windowed result and then overwritten with another.
;   out : 18 results written at a stride of 4*SBLIMIT bytes.
; Uses xmm0-xmm7; m8 is touched only when ARCH_X86_64 is set.
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4
    movlps  m4, [inq+64]
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps   m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps   m3, m3, m7
    addps   m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps   m5, m0, m3

    ; On x86-64 the difference m0-m3 is parked in m8 so it can be reused
    ; below; on x86-32 it is recomputed instead.
%if ARCH_X86_64
    SWAP    m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps   m5, m8, [ps_val2]
%else
    mulps   m5, m5, [ps_val2]
%endif
    addps   m7, m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m7, m5

%if ARCH_X86_64
    SWAP    m5, m8
%else
    subps   m5, m0, m3
%endif

    subps   m5, m5, m6
    addps   m5, m5, m2

    shufps  m6, m4, m3, 0xe4
    subps   m6, m6, m2
    mulps   m6, m6, [ps_val3]

    addps   m4, m4, m1
    mulps   m4, m4, [ps_val4]

    shufps  m1, m1, m0, 0xe4
    addps   m1, m1, m2
    mulps   m1, m1, [ps_val5]

    mulps   m3, m3, [ps_val6]
    mulps   m0, m0, [ps_val7]
    addps   m0, m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m2, m4
    addps   m2, m2, m0

    addps   m3, m4, m0
    subps   m3, m3, m6
    xorps   m3, m3, [ps_p1p1m1m1]

    shufps  m0, m0, m4, 0xe4
    subps   m0, m0, m1
    addps   m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    BUTTERF  m0, m1, 0
    BUTTERF  m7, m2, 16
    BUTTERF  m3, m6, 32
    BUTTERF  m4, m1, 48
    BUTTERF2 m5, m1, 64

    ; permutation:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP     m4, m3
    ; permutation done

    ; Windowing: results written to out are value*win + buf; buf is then
    ; refilled with the remaining values multiplied by win.
    PSHUFD   m6, m2, 0xb1
    movss    m4, [bufq + 4*68]
    movss    m7, [bufq + 4*64]
    unpcklps m7, m7, m4
    mulps    m6, m6, [winq + 16*4]
    addps    m6, m6, m7
    movss    [outq + 64*SBLIMIT], m6
    shufps   m6, m6, m6, 0xb1
    movss    [outq + 68*SBLIMIT], m6

    mulps    m6, m3, [winq + 4*4]
    LOAD     m4, m7, bufq + 4*16, 16
    addps    m6, m6, m4
    STORE    m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps   m4, m0, m3, 0xb5
    mulps    m4, m4, [winq + 8*4]
    LOAD     m7, m6, bufq + 4*32, 16
    addps    m4, m4, m7
    STORE    m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps   m3, m3, m2, 0xb1
    mulps    m3, m3, [winq + 12*4]
    LOAD     m7, m6, bufq + 4*48, 16
    addps    m3, m3, m7
    STORE    m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps    m2, m2, [winq]
    LOAD     m6, m7, bufq, 16
    addps    m2, m2, m6
    STORE    m2, m7, outq, 4*SBLIMIT

    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro

; The plain SSE version is only built for 32-bit targets.
%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
%endif

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

; SPILL reg, slot / UNSPILL reg, slot / SPILLED(slot)
; Slots are numbered 8..15.  On x86-64 a slot is simply register m<slot>, so
; SPILL/UNSPILL are register renames (SWAP).  On x86-32 a slot is 16 bytes of
; tmp starting at byte offset 32*4, i.e. just past the tmpq+4*0 .. tmpq+4*28
; area that four_imdct36_float addresses directly.
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

; four_imdct36_float(out, buf, in, win, tmp)
; Handles four input blocks at once, one per SIMD lane.  The blocks are read
; from inq, inq+72, inq+2*72 and inq+3*72 (72 bytes = 18 floats).  Blocks 0
; and 2 are loaded with mova and blocks 1 and 3 with movu, since 72 is not a
; multiple of 16.
;   tmp : scratch memory accessed with mova (plus the SPILL area on x86-32).
;   out : 18 rows of four floats, written with mova at multiples of 128 bytes.
;   buf : 18 rows of four floats at a 16-byte stride; read, then overwritten.
;   win : window coefficients, read four at a time.
; NOTE(review): TRANSPOSE4x4PS comes from x86util.asm; it is presumably a 4x4
; float transpose that leaves one coefficient index per register - confirm.
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; in[16] and in[17] of the four blocks: m0 = in[16], m5 = in[17]
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; in[12..15] of the four blocks
    mova    m1, [inq+48]
    movu    m6, [inq+48 +   72]
    mova    m7, [inq+48 + 2*72]
    movu    m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps   m7, m3
    addps   m6, m1
    addps   m3, m0
    addps   m0, m5
    addps   m0, m7
    addps   m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; in[8..11] of the four blocks
    mova    m4, [inq+32]
    movu    m5, [inq+32 +   72]
    mova    m2, [inq+32 + 2*72]
    movu    m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps   m7, m2
    addps   m5, m4
    addps   m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; in[4..7] of the four blocks
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; in[0..3] of the four blocks
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps   m2, m5
    addps   m5, m3
    addps   m7, m5
    addps   m3, m6
    addps   m6, m1
    SPILL   7, 8
    addps   m5, m6
    SPILL   6, 9
    addps   m6, m4, SPILLED(12)
    subps   m6, m2
    UNSPILL 7, 11
    SPILL   5, 11
    subps   m5, m1, m7
    mulps   m7, [costabs + 16*5]
    addps   m7, m1
    mulps   m0, m6, [costabs + 16*6]
    addps   m0, m5
    mova    [tmpq+4*24], m0
    addps   m6, m5
    mova    [tmpq+4*4], m6
    addps   m6, m4, m2
    mulps   m6, [costabs + 16*1]
    subps   m4, SPILLED(12)
    mulps   m4, [costabs + 16*8]
    addps   m2, SPILLED(12)
    mulps   m2, [costabs + 16*3]
    subps   m5, m7, m6
    subps   m5, m2
    addps   m6, m7
    addps   m6, m4
    addps   m7, m2
    subps   m7, m4
    mova    [tmpq+4*20], m7
    mova    m2, [tmpq+4*28]
    mova    [tmpq+4*28], m5
    UNSPILL 7, 13
    subps   m5, m7, m2
    mulps   m5, [costabs + 16*7]
    UNSPILL 1, 10
    mulps   m1, [costabs + 16*2]
    addps   m4, m3, m2
    mulps   m4, [costabs + 16*4]
    addps   m2, m7
    addps   m7, m3
    mulps   m7, [costabs]
    subps   m3, m2
    mulps   m3, [costabs + 16*2]
    addps   m2, m7, m5
    addps   m2, m1
    SPILL   2, 10
    addps   m7, m4
    subps   m7, m1
    SPILL   7, 12
    subps   m5, m4
    subps   m5, m1
    UNSPILL 0, 14
    SPILL   5, 13
    addps   m1, m0, SPILLED(15)
    subps   m1, SPILLED(8)
    mova    m4, [costabs + 16*5]
    mulps   m4, [tmpq]
    UNSPILL 2, 9
    addps   m4, m2
    subps   m2, [tmpq]
    mulps   m5, m1, [costabs + 16*6]
    addps   m5, m2
    SPILL   5, 9
    addps   m2, m1
    SPILL   2, 14
    UNSPILL 5, 15
    subps   m7, m5, m0
    addps   m5, SPILLED(8)
    mulps   m5, [costabs + 16*1]
    mulps   m7, [costabs + 16*8]
    addps   m0, SPILLED(8)
    mulps   m0, [costabs + 16*3]
    subps   m2, m4, m5
    subps   m2, m0
    SPILL   2, 15
    addps   m5, m4
    addps   m5, m7
    addps   m4, m0
    subps   m4, m7
    SPILL   4, 8
    mova    m7, [tmpq+4*16]
    mova    m2, [tmpq+4*12]
    addps   m0, m7, m2
    subps   m0, SPILLED(11)
    mulps   m0, [costabs + 16*2]
    addps   m4, m7, SPILLED(11)
    mulps   m4, [costabs]
    subps   m7, m2
    mulps   m7, [costabs + 16*7]
    addps   m2, SPILLED(11)
    mulps   m2, [costabs + 16*4]
    addps   m1, m7, [tmpq+4*8]
    addps   m1, m4
    addps   m4, m2
    subps   m4, [tmpq+4*8]
    SPILL   4, 11
    subps   m7, m2
    subps   m7, [tmpq+4*8]
    addps   m4, m6, SPILLED(10)
    subps   m6, SPILLED(10)
    addps   m2, m5, m1
    mulps   m2, [costabs + 16*9]
    subps   m5, m1
    mulps   m5, [costabs + 16*17]
    subps   m1, m4, m2
    addps   m4, m2
    ; From here on, results are emitted in groups: the difference term is
    ; windowed, added to buf and stored to two out rows; the sum term is
    ; windowed and stored back into the same two buf rows.
    mulps   m2, m1, [winq+4*36]
    addps   m2, [bufq+4*36]
    mova    [outq+1152], m2
    mulps   m1, [winq+4*32]
    addps   m1, [bufq+4*32]
    mova    [outq+1024], m1
    mulps   m1, m4, [winq+4*116]
    mova    [bufq+4*36], m1
    mulps   m4, [winq+4*112]
    mova    [bufq+4*32], m4
    addps   m2, m6, m5
    subps   m6, m5
    mulps   m1, m6, [winq+4*68]
    addps   m1, [bufq+4*68]
    mova    [outq+2176], m1
    mulps   m6, [winq]
    addps   m6, [bufq]
    mova    [outq], m6
    mulps   m1, m2, [winq+4*148]
    mova    [bufq+4*68], m1
    mulps   m2, [winq+4*80]
    mova    [bufq], m2
    addps   m5, m3, [tmpq+4*24]
    mova    m2, [tmpq+4*24]
    subps   m2, m3
    mova    m1, SPILLED(9)
    subps   m1, m0
    mulps   m1, [costabs + 16*10]
    addps   m0, SPILLED(9)
    mulps   m0, [costabs + 16*16]
    addps   m6, m5, m1
    subps   m5, m1
    mulps   m3, m5, [winq+4*40]
    addps   m3, [bufq+4*40]
    mova    [outq+1280], m3
    mulps   m5, [winq+4*28]
    addps   m5, [bufq+4*28]
    mova    [outq+896], m5
    mulps   m1, m6, [winq+4*120]
    mova    [bufq+4*40], m1
    mulps   m6, [winq+4*108]
    mova    [bufq+4*28], m6
    addps   m1, m2, m0
    subps   m2, m0
    mulps   m5, m2, [winq+4*64]
    addps   m5, [bufq+4*64]
    mova    [outq+2048], m5
    mulps   m2, [winq+4*4]
    addps   m2, [bufq+4*4]
    mova    [outq+128], m2
    mulps   m0, m1, [winq+4*144]
    mova    [bufq+4*64], m0
    mulps   m1, [winq+4*84]
    mova    [bufq+4*4], m1
    mova    m1, [tmpq+4*28]
    mova    m5, m1
    addps   m1, SPILLED(13)
    subps   m5, SPILLED(13)
    UNSPILL 3, 15
    addps   m2, m7, m3
    mulps   m2, [costabs + 16*11]
    subps   m3, m7
    mulps   m3, [costabs + 16*15]
    addps   m0, m2, m1
    subps   m1, m2
    SWAP    m0, m2
    mulps   m6, m1, [winq+4*44]
    addps   m6, [bufq+4*44]
    mova    [outq+1408], m6
    mulps   m1, [winq+4*24]
    addps   m1, [bufq+4*24]
    mova    [outq+768], m1
    mulps   m0, m2, [winq+4*124]
    mova    [bufq+4*44], m0
    mulps   m2, [winq+4*104]
    mova    [bufq+4*24], m2
    addps   m0, m5, m3
    subps   m5, m3
    mulps   m1, m5, [winq+4*60]
    addps   m1, [bufq+4*60]
    mova    [outq+1920], m1
    mulps   m5, [winq+4*8]
    addps   m5, [bufq+4*8]
    mova    [outq+256], m5
    mulps   m1, m0, [winq+4*140]
    mova    [bufq+4*60], m1
    mulps   m0, [winq+4*88]
    mova    [bufq+4*8], m0
    mova    m1, [tmpq+4*20]
    addps   m1, SPILLED(12)
    mova    m2, [tmpq+4*20]
    subps   m2, SPILLED(12)
    UNSPILL 7, 8
    subps   m0, m7, SPILLED(11)
    addps   m7, SPILLED(11)
    mulps   m4, m7, [costabs + 16*12]
    mulps   m0, [costabs + 16*14]
    addps   m5, m1, m4
    subps   m1, m4
    mulps   m7, m1, [winq+4*48]
    addps   m7, [bufq+4*48]
    mova    [outq+1536], m7
    mulps   m1, [winq+4*20]
    addps   m1, [bufq+4*20]
    mova    [outq+640], m1
    mulps   m1, m5, [winq+4*128]
    mova    [bufq+4*48], m1
    mulps   m5, [winq+4*100]
    mova    [bufq+4*20], m5
    addps   m6, m2, m0
    subps   m2, m0
    mulps   m1, m2, [winq+4*56]
    addps   m1, [bufq+4*56]
    mova    [outq+1792], m1
    mulps   m2, [winq+4*12]
    addps   m2, [bufq+4*12]
    mova    [outq+384], m2
    mulps   m0, m6, [winq+4*136]
    mova    [bufq+4*56], m0
    mulps   m6, [winq+4*92]
    mova    [bufq+4*12], m6
    UNSPILL 0, 14
    mulps   m0, [costabs + 16*13]
    mova    m3, [tmpq+4*4]
    addps   m2, m0, m3
    subps   m3, m0
    mulps   m0, m3, [winq+4*52]
    addps   m0, [bufq+4*52]
    mova    [outq+1664], m0
    mulps   m3, [winq+4*16]
    addps   m3, [bufq+4*16]
    mova    [outq+512], m3
    mulps   m0, m2, [winq+4*132]
    mova    [bufq+4*52], m0
    mulps   m2, [winq+4*96]
    mova    [bufq+4*16], m2
    RET
%endmacro

INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif