;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Lane-select masks for andps: an all-ones dword keeps the lane, 0 clears it.
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

; Per-lane multipliers used by imdct36_float while populating tmp[].
ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097

; Sign-bit masks for xorps: 0x80000000 flips the sign of that lane.
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000

; Multipliers for BUTTERF/BUTTERF2 on the non-SSE3 path, one 16-byte row per
; butterfly (the macros index this table with a byte offset).
ps_cosh:  dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
          dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
          dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
          dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
          dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same magnitudes as ps_cosh with lanes 1 and 3 negated; used on the SSE3 path,
; where addsubps replaces the xorps/addps pair of the non-SSE3 path.
ps_cosh_sse3: dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
              dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
              dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
              dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
              dd 1.0, -0.70710678118654752439,  0.0,  0.0

; Constants for four_imdct36_float, each replicated across all four lanes.
; Row k is addressed as [costabs + 16*k].
costabs:  times 4 dd  0.98480773
          times 4 dd  0.93969262
          times 4 dd  0.86602539
          times 4 dd -0.76604444
          times 4 dd -0.64278764
          times 4 dd  0.50000000
          times 4 dd -0.50000000
          times 4 dd -0.34202015
          times 4 dd -0.17364818
          times 4 dd  0.50190992
          times 4 dd  0.51763808
          times 4 dd  0.55168896
          times 4 dd  0.61038726
          times 4 dd  0.70710677
          times 4 dd  0.87172341
          times 4 dd  1.18310082
          times 4 dd  1.93185163
          times 4 dd  5.73685646

; Output stride in floats: imdct36_float writes consecutive results
; 4*SBLIMIT bytes apart.
%define SBLIMIT 32
SECTION .text

; PSHUFD dst, src, imm8
; Shuffle the four dwords of src into dst. Uses pshufd when SSE2 is available
; and AVX is not; otherwise uses shufps with src as both sources.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd  %1, %2, %3
%else
    shufps  %1, %2, %2, %3
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps  %1, %2, %3, 0x4e
%else
    movlhps %1, %3
    movhlps %1, %2
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%endif
%endmacro

; INVERTHL dst, src
; input  %2={x1,x2,x3,x4}
; output %1={x3,x4,x1,x2}  (64-bit halves swapped)
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF data, scratch, table_offset
; %1 is both input and output, %2 is clobbered, %3 is a byte offset into
; ps_cosh / ps_cosh_sse3.
; Combines %1 with its half-swapped copy (upper two lanes sign-flipped first),
; scales by the table row, then combines each lane with its pair neighbour
; (shuffle 0xb1 swaps lanes within each pair).
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; BUTTERF2 data, scratch, table_offset
; Like the second half of BUTTERF, but without the initial half-swap step and
; with shuffle 0xe1, which swaps only lanes 0 and 1 and leaves lanes 2 and 3
; in place. %1 is input/output, %2 is clobbered.
%macro BUTTERF2 3
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE vec, scratch, base, stride
; Scatter the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4].
; On the non-SSE4 path %2 is clobbered and %1 is left with its lanes permuted.
%macro STORE 4
%if cpuflag(sse4)
    movss     [%3       ], %1
    extractps dword [%3 +   %4], %1, 1
    extractps dword [%3 + 2*%4], %1, 2
    extractps dword [%3 + 3*%4], %1, 3
%else
    movhlps %2, %1
    movss   [%3       ], %1
    movss   [%3 + 2*%4], %2
    shufps  %1, %1, 0xb1
    movss   [%3 +   %4], %1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2
%endif
%endmacro

; LOAD dst, scratch, base, stride
; Gather the floats at [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4] into %1.
; Each access reads 8 bytes; the shuffle (0x88) keeps only the first float of
; each pair. %2 is clobbered.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

; LOADA64 dst, addr
; Load 16 bytes from %2 into %1. Without AVX this is done as two 8-byte loads.
; NOTE(review): the name suggests the source is only assumed 64-bit aligned --
; confirm against the callers.
%macro LOADA64 2
%if cpuflag(avx)
    movu    %1, [%2]
%else
    movlps  %1, [%2]
    movhps  %1, [%2 + 8]
%endif
%endmacro

;------------------------------------------------------------------------------
; imdct36_float(out, buf, in, win)
;
; Register arguments as named by cglobal: outq, bufq, inq, winq.
;   in  : 18 floats are read, [inq] .. [inq+64+7].
;   win : rows of 4 floats at [winq + 4*k], k = 0, 4, ..., 36.
;   buf : floats read from, then overwritten at, a 16-byte stride
;         ([bufq + 16*i], i = 0..17).
;   out : 18 floats written at a stride of 4*SBLIMIT bytes.
; The first 18 windowed results are added to the old buf contents and written
; to out; the other 18 windowed results replace buf.
; Per x86inc's cglobal convention, "4,4,9" declares 4 arguments, 4 GPRs and
; 9 vector registers (m8 is only used when ARCH_X86_64).
;------------------------------------------------------------------------------
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4
    movlps  m4, [inq+64]              ; m4 = {in[16], in[17], 0, 0}
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps   m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps   m3, m3, m7
    addps   m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5                ; zero out high values
    subps   m6, m6, m4

    subps   m5, m0, m3

    ; With 9 registers available (x86_64) keep m0 - m3 in m8 for later;
    ; on x86_32 it is recomputed below instead.
%if ARCH_X86_64
    SWAP    m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps   m5, m8, [ps_val2]
%else
    mulps   m5, m5, [ps_val2]
%endif
    addps   m7, m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m7, m5

%if ARCH_X86_64
    SWAP    m5, m8
%else
    subps   m5, m0, m3
%endif

    subps   m5, m5, m6
    addps   m5, m5, m2

    shufps  m6, m4, m3, 0xe4
    subps   m6, m6, m2
    mulps   m6, m6, [ps_val3]

    addps   m4, m4, m1
    mulps   m4, m4, [ps_val4]

    shufps  m1, m1, m0, 0xe4
    addps   m1, m1, m2
    mulps   m1, m1, [ps_val5]

    mulps   m3, m3, [ps_val6]
    mulps   m0, m0, [ps_val7]
    addps   m0, m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m2, m4
    addps   m2, m2, m0

    addps   m3, m4, m0
    subps   m3, m3, m6
    xorps   m3, m3, [ps_p1p1m1m1]

    shufps  m0, m0, m4, 0xe4
    subps   m0, m0, m1
    addps   m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    BUTTERF  m0, m1, 0
    BUTTERF  m7, m2, 16
    BUTTERF  m3, m6, 32
    BUTTERF  m4, m1, 48
    BUTTERF2 m5, m1, 64

    ; Permutation:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP     m4, m3
    ; permutation done

    ; First half: out[] = value * win[] + buf[]
    PSHUFD  m6, m2, 0xb1
    movss   m4, [bufq + 4*68]
    movss   m7, [bufq + 4*64]
    unpcklps m7, m7, m4
    mulps   m6, m6, [winq + 16*4]
    addps   m6, m6, m7
    movss   [outq + 64*SBLIMIT], m6
    shufps  m6, m6, m6, 0xb1
    movss   [outq + 68*SBLIMIT], m6

    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps  m4, m0, m3, 0xb5
    mulps   m4, m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps  m3, m3, m2, 0xb1
    mulps   m3, m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps   m2, m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; Second half: buf[] = value * win[]  (overwrites the values consumed above)
    mulps   m4, m1, [winq + 20*4]
    STORE   m4, m7, bufq, 16

    mulps   m3, m5, [winq + 24*4]
    STORE   m3, m7, bufq + 4*16, 16

    shufps  m0, m0, m5, 0xb0
    mulps   m0, m0, [winq + 28*4]
    STORE   m0, m7, bufq + 4*32, 16

    shufps  m5, m5, m1, 0xb1
    mulps   m5, m5, [winq + 32*4]
    STORE   m5, m7, bufq + 4*48, 16

    shufps  m1, m1, m1, 0xb1
    mulps   m1, m1, [winq + 36*4]
    movss   [bufq + 4*64], m1
    shufps  m1, m1, 0xb1
    movss   [bufq + 4*68], m1
    RET
%endmacro

; Instantiate imdct36_float once per instruction set. The plain SSE version is
; only built for 32-bit x86.
%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
%endif

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

; SPILL reg, slot / UNSPILL reg, slot / SPILLED(slot)
; Slots are numbered 8..15.
;   x86_64: a slot is simply register m<slot>, so SPILL/UNSPILL are register
;           renames (SWAP) and emit no instructions.
;   x86_32: a slot is 16 bytes of the tmp buffer at tmpq + 128 + (slot-8)*16.
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

;------------------------------------------------------------------------------
; four_imdct36_float(out, buf, in, win, tmp)
;
; Register arguments as named by cglobal: outq, bufq, inq, winq, tmpq.
; Processes four input blocks at once, one block per vector lane:
;   in  : four blocks spaced 72 bytes (18 floats) apart; 18 floats are read
;         from each and transposed so that a register holds one input index
;         for all four blocks.
;   win : rows of 4 floats at [winq + 4*k].
;   buf : rows of 4 floats at [bufq + 16*i], i = 0..17; read, then overwritten.
;   out : rows of 4 floats written at a 128-byte stride, [outq] .. [outq+2176].
;   tmp : scratch; bytes 0..127 are used directly, and on x86_32 bytes
;         128..255 additionally hold the SPILL slots.
; buf, out, tmp, win and the first and third input blocks are accessed with
; mova; presumably this requires 16-byte alignment -- confirm against x86inc
; and the callers.
;------------------------------------------------------------------------------
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; floats 16 and 17 of each block: m0 = in[16] x4, m5 = in[17] x4
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; floats 12..15 of each block
    mova    m1, [inq+48]
    movu    m6, [inq+48 +   72]
    mova    m7, [inq+48 + 2*72]
    movu    m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; floats 8..11 of each block
    mova    m4, [inq+32]
    movu    m5, [inq+32 +   72]
    mova    m2, [inq+32 + 2*72]
    movu    m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; floats 4..7 of each block
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; floats 0..3 of each block
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]
    ; From here on each group follows the same pattern:
    ;   out row = value * win row + old buf row
    ;   buf row = other value * another win row
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro

; Instantiate four_imdct36_float for SSE and, when the assembler supports it,
; for AVX.
INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif