1;******************************************************************************
2;* 36 point SSE-optimized IMDCT transform
3;* Copyright (c) 2011 Vitor Sessak
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA
25
; Lane-select masks for andps: a ~0 dword keeps that 32-bit lane, a 0 dword
; clears it. The first dd is lane 0 (lowest address).
;   ps_mask  - clears lane 0
;   ps_mask2 - keeps lanes 1 and 3
;   ps_mask3 - keeps lane 3 only
;   ps_mask4 - keeps lane 1 only
26align 16
27ps_mask:  dd 0, ~0, ~0, ~0
28ps_mask2: dd 0, ~0,  0, ~0
29ps_mask3: dd 0,  0,  0, ~0
30ps_mask4: dd 0, ~0,  0,  0
31
; Per-lane multipliers used while building tmp[] in imdct36_float. Each
; constant is duplicated over a lane pair (lanes 0-1 and lanes 2-3).
; The magnitudes agree, to the printed precision, with cos(k*pi/18):
;   k=1 0.9848..., k=2 0.9396..., k=3 0.8660..., k=4 0.7660...,
;   k=5 0.6427..., k=6 0.5,       k=7 0.3420..., k=8 0.1736...
32ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
33ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
34ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
35ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
36ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
37ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
38ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097
39
; Sign-bit masks for xorps: 0x80000000 flips the sign of a float lane, 0
; leaves it unchanged.
;   ps_p1p1m1m1 - negates lanes 2 and 3
;   ps_p1m1p1m1 - negates lanes 1 and 3
40ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
41ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000
42
; Multiplier rows for BUTTERF/BUTTERF2: five 16-byte rows, selected by the
; byte offset (0, 16, 32, 48, 64) passed as the macros' third argument.
; The non-unit magnitudes agree, to the printed precision, with
; 0.5/cos((2*j+1)*pi/36), j = 0..8.
; ps_cosh is used by the non-SSE3 code path.
43ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
44               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
45               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
46               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
47               dd 1.0, 0.70710678118654752439,  0.0,  0.0
48
; Same rows with lanes 1 and 3 negated. The SSE3 code path multiplies by
; these and then uses addsubps (subtract in lanes 0/2, add in lanes 1/3),
; which replaces the xorps sign flip + addps of the non-SSE3 path.
49ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
50               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
51               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
52               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
53               dd 1.0, -0.70710678118654752439,  0.0,  0.0
54
; Constant table for four_imdct36_float. Every entry is replicated into all
; four lanes (times 4 dd), so [costabs + 16*k] is a full vector of entry k.
;   entries 0..8  - signed cosine-type factors (same magnitudes as ps_val*)
;   entries 9..17 - the 0.5/cos(...) factors (same magnitudes as ps_cosh),
;                   in ascending order
55costabs:  times 4 dd  0.98480773
56          times 4 dd  0.93969262
57          times 4 dd  0.86602539
58          times 4 dd -0.76604444
59          times 4 dd -0.64278764
60          times 4 dd  0.50000000
61          times 4 dd -0.50000000
62          times 4 dd -0.34202015
63          times 4 dd -0.17364818
64          times 4 dd  0.50190992
65          times 4 dd  0.51763808
66          times 4 dd  0.55168896
67          times 4 dd  0.61038726
68          times 4 dd  0.70710677
69          times 4 dd  0.87172341
70          times 4 dd  1.18310082
71          times 4 dd  1.93185163
72          times 4 dd  5.73685646
73
; SBLIMIT scales the byte offsets and the store stride used for the out
; pointer in imdct36_float (consecutive outputs are 4*SBLIMIT bytes apart).
74%define SBLIMIT 32
75SECTION_TEXT
76
; PSHUFD dst, src, imm8
; Lane permutation of a single source: dst lane i = src lane selected by
; bits [2*i+1:2*i] of imm8.
; Emits pshufd when building for SSE2 or later without AVX; otherwise emits
; shufps with src given as both inputs, which selects the same lanes
; (pshufd is an SSE2 instruction, so the plain-SSE build cannot use it).
77%macro PSHUFD 3
78%if cpuflag(sse2) && notcpuflag(avx)
79    pshufd %1, %2, %3
80%else
81    shufps %1, %2, %2, %3
82%endif
83%endmacro
84
; BUILDINVHIGHLOW dst, x, y
; Concatenates the high half of x with the low half of y.
85; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
86; output %1={x3,x4,y1,y2}
; The non-AVX path writes the high half of %1 (from %3) before it reads the
; high half of %2, so %1 must not be the same register as %2 there.
87%macro BUILDINVHIGHLOW 3
88%if cpuflag(avx)
89    shufps %1, %2, %3, 0x4e
90%else
91    movlhps %1, %3
92    movhlps %1, %2
93%endif
94%endmacro
95
; ROTLEFT dst, x, y
; Extracts four consecutive lanes starting at the last lane of x from the
; eight-lane sequence x:y.
96; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
97; output %1={x4,y1,y2,y3}
; SSSE3 and later use a single palignr (byte shift of 12 across %3:%2).
; Otherwise BUILDINVHIGHLOW first forms {x3,x4,y1,y2} and the shufps with
; 0x99 picks lanes 1,2 of that result and lanes 1,2 of %3.
98%macro ROTLEFT 3
99%if cpuflag(ssse3)
100    palignr  %1, %3, %2, 12
101%else
102    BUILDINVHIGHLOW %1, %2, %3
103    shufps  %1, %1, %3, 0x99
104%endif
105%endmacro
106
; INVERTHL dst, src
; dst = src with its low and high 64-bit halves exchanged:
; {s0,s1,s2,s3} -> {s2,s3,s0,s1}.
; The plain-SSE path builds dst in two steps that both read src, so dst and
; src must be different registers there.
107%macro INVERTHL 2
108%if cpuflag(sse2)
109    PSHUFD  %1, %2, 0x4e
110%else
111    movhlps %1, %2
112    movlhps %1, %2
113%endif
114%endmacro
115
; BUTTERF data, scratch, tab_offset
;   %1 - input vector {a0,a1,a2,a3}; overwritten with the result
;   %2 - scratch register, clobbered
;   %3 - byte offset of the 16-byte row to use in ps_cosh / ps_cosh_sse3
; Stage 1: %1 = {a0+a2, a1+a3, a0-a2, a1-a3}.
; Stage 2: with t = stage-1 result multiplied lane-wise by the ps_cosh row,
;          %1 = {t0+t1, t0-t1, t2+t3, t2-t3}.
; Both code paths produce the same stage-2 values; the SSE3 path gets the
; sign pattern from the pre-negated table plus addsubps.
116%macro BUTTERF 3
117    INVERTHL %2, %1
118    xorps    %1, [ps_p1p1m1m1]
119    addps    %1, %2
120%if cpuflag(sse3)
121    mulps    %1, %1, [ps_cosh_sse3 + %3]
    ; 0xb1 swaps the lanes within each pair: {t1,t0,t3,t2}
122    PSHUFD   %2, %1, 0xb1
123    addsubps %1, %1, %2
124%else
125    mulps    %1, [ps_cosh + %3]
    ; 0xb1 swaps the lanes within each pair: {t1,t0,t3,t2}
126    PSHUFD   %2, %1, 0xb1
127    xorps    %1, [ps_p1m1p1m1]
128    addps    %1, %2
129%endif
130%endmacro
131
; BUTTERF2 data, scratch, tab_offset
;   %1 - input vector; overwritten with the result
;   %2 - scratch register, clobbered
;   %3 - byte offset of the 16-byte row to use in ps_cosh / ps_cosh_sse3
; Second butterfly stage only (there is no half-swap/add stage as in
; BUTTERF), and only lanes 0 and 1 are exchanged by the 0xe1 shuffle:
; with t = %1 multiplied lane-wise by the table row, lanes 0/1 become
; {t0+t1, t0-t1}. The only caller uses the row at offset 64, whose lanes 2
; and 3 are 0.0, and treats the upper two result lanes as don't-care.
132%macro BUTTERF2 3
133%if cpuflag(sse3)
134    mulps    %1, %1, [ps_cosh_sse3 + %3]
135    PSHUFD   %2, %1, 0xe1
136    addsubps %1, %1, %2
137%else
138    mulps    %1, [ps_cosh + %3]
139    PSHUFD   %2, %1, 0xe1
140    xorps    %1, [ps_p1m1p1m1]
141    addps    %1, %2
142%endif
143%endmacro
144
; STORE vec, scratch, base, stride
; Scatters the four float lanes of vec to memory: lane k is written to
; [base + k*stride] (stride in bytes), one movss per lane.
;   %1 - vector to store; its lanes are permuted by the macro (clobbered)
;   %2 - scratch register, clobbered
;   %3 - base address expression
;   %4 - byte stride between consecutive lanes
145%macro STORE 4
146    movhlps %2, %1
147    movss   [%3       ], %1
148    movss   [%3 + 2*%4], %2
    ; swap lanes within each pair so lanes 1 and 3 move into movss position
149    shufps  %1, %1, 0xb1
150    movss   [%3 +   %4], %1
151    movhlps %2, %1
152    movss   [%3 + 3*%4], %2
153%endmacro
154
; LOAD dst, scratch, base, stride
; Gathers four floats spaced stride bytes apart:
; dst lane k = float at [base + k*stride].
;   %1 - destination
;   %2 - scratch register, clobbered
;   %3 - base address expression
;   %4 - byte stride
; Each movlps/movhps reads 8 bytes, i.e. the wanted float plus the one
; following it; the final shufps (0x88) keeps only the first of each pair.
155%macro LOAD 4
156    movlps  %1, [%3       ]
157    movhps  %1, [%3 +   %4]
158    movlps  %2, [%3 + 2*%4]
159    movhps  %2, [%3 + 3*%4]
160    shufps  %1, %2, 0x88
161%endmacro
162
; LOADA64 dst, addr
; Loads 16 bytes from addr into dst without using an aligned 16-byte load:
; one unaligned move on AVX, two 8-byte half loads otherwise.
; NOTE(review): the name suggests addr is only guaranteed to be 8-byte
; aligned - confirm against the callers' buffer alignment.
163%macro LOADA64 2
164%if cpuflag(avx)
165   movu     %1, [%2]
166%else
167   movlps   %1, [%2]
168   movhps   %1, [%2 + 8]
169%endif
170%endmacro
171
; DEFINE_IMDCT
; Emits one imdct36_float function for the instruction set selected by the
; preceding INIT_XMM.
; cglobal arguments (named out, buf, in, win; used as outq/bufq/inq/winq):
;   in  - 18 floats are read: four 16-byte loads at +0..+48 and 8 bytes at +64
;   win - read as 16-byte vectors at float index 0, 4, ..., 36 directly as
;         mulps memory operands (which must be 16-byte aligned in the
;         non-AVX builds)
;   buf - floats at index 0, 4, ..., 68 (every 4th float) are first read and
;         added into the outputs, then overwritten with new values
;   out - 18 floats are written, consecutive outputs 4*SBLIMIT bytes apart
; Registers: m0-m7, plus m8 as a temporary on x86-64 only.
172%macro DEFINE_IMDCT 0
173cglobal imdct36_float, 4,4,9, out, buf, in, win
174
175    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    ; m0..m3 hold in[0..15]; each vector gets its neighbour shifted by one
    ; lane added to it (ROTLEFT supplies the lane crossing the vector
    ; boundary).
176    LOADA64 m0, inq
177    LOADA64 m1, inq + 16
178
179    ROTLEFT m5, m0, m1
180
    ; first vector: shift right by one lane and clear lane 0 (in[0] has no
    ; predecessor)
181    PSHUFD  m6, m0, 0x93
182    andps   m6, m6, [ps_mask]
183    addps   m0, m0, m6
184
185    LOADA64 m2, inq + 32
186
187    ROTLEFT m7, m1, m2
188
189    addps   m1, m1, m5
190    LOADA64 m3, inq + 48
191
192    ROTLEFT m5, m2, m3
193
    ; m4 = {in[16], in[17], 0, 0}
194    xorps   m4, m4, m4
195    movlps  m4, [inq+64]
196    BUILDINVHIGHLOW m6, m3, m4
197    shufps  m6, m6, m4, 0xa9
198
199    addps   m4, m4, m6
200    addps   m2, m2, m7
201    addps   m3, m3, m5
202
203    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    ; Only odd indices are updated, hence the masks that keep lanes 1 and 3
    ; (or a single lane at the ends) of the two-lane-shifted vectors.
204    movlhps m5, m5, m0
205    andps   m5, m5, [ps_mask3]
206
207    BUILDINVHIGHLOW m7, m0, m1
208    andps   m7, m7, [ps_mask2]
209
210    addps   m0, m0, m5
211
212    BUILDINVHIGHLOW m6, m1, m2
213    andps   m6, m6, [ps_mask2]
214
215    addps  m1, m1, m7
216
217    BUILDINVHIGHLOW m7, m2, m3
218    andps   m7, m7, [ps_mask2]
219
220    addps   m2, m2, m6
221
222    movhlps m6, m6, m3
223    andps   m6, m6, [ps_mask4]
224
225    addps  m3, m3, m7
226    addps  m4, m4, m6
227
228    ; Populate tmp[]
229    movlhps m6, m1, m5    ; zero out high values
230    subps   m6, m6, m4
231
232    subps  m5, m0, m3
233
    ; m0 - m3 is needed twice. On x86-64 it is parked in m8 (SWAP only
    ; renames registers); on x86-32 it is consumed by the multiply below and
    ; recomputed afterwards.
234%if ARCH_X86_64
235    SWAP   m5, m8
236%endif
237
238    mulps  m7, m2, [ps_val1]
239
240%if ARCH_X86_64
241    mulps  m5, m8, [ps_val2]
242%else
243    mulps  m5, m5, [ps_val2]
244%endif
245    addps  m7, m7, m5
246
247    mulps  m5, m6, [ps_val1]
248    subps  m7, m7, m5
249
250%if ARCH_X86_64
251    SWAP   m5, m8
252%else
253    subps  m5, m0, m3
254%endif
255
256    subps  m5, m5, m6
257    addps  m5, m5, m2
258
259    shufps m6, m4, m3, 0xe4
260    subps  m6, m6, m2
261    mulps  m6, m6, [ps_val3]
262
263    addps  m4, m4, m1
264    mulps  m4, m4, [ps_val4]
265
266    shufps m1, m1, m0, 0xe4
267    addps  m1, m1, m2
268    mulps  m1, m1, [ps_val5]
269
270    mulps  m3, m3, [ps_val6]
271    mulps  m0, m0, [ps_val7]
272    addps  m0, m0, m3
273
274    xorps  m2, m1, [ps_p1p1m1m1]
275    subps  m2, m2, m4
276    addps  m2, m2, m0
277
278    addps  m3, m4, m0
279    subps  m3, m3, m6
280    xorps  m3, m3, [ps_p1p1m1m1]
281
282    shufps m0, m0, m4, 0xe4
283    subps  m0, m0, m1
284    addps  m0, m0, m6
285
286    BUILDINVHIGHLOW m4, m2, m3
287    shufps  m3, m3, m2, 0x4e
288
289    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
290
    ; Butterflies; the third argument selects the 16-byte ps_cosh row.
    ; m1, m2 and m6 are only scratch here.
291    BUTTERF  m0, m1, 0
292    BUTTERF  m7, m2, 16
293    BUTTERF  m3, m6, 32
294    BUTTERF  m4, m1, 48
295    BUTTERF2 m5, m1, 64
296
297    ; permutes:
298    ; m0    0  1  2  3     =>     2  6 10 14   m1
299    ; m7    4  5  6  7     =>     3  7 11 15   m2
300    ; m3    8  9 10 11     =>    17 13  9  5   m3
301    ; m4   12 13 14 15     =>    16 12  8  4   m5
302    ; m5   16 17 xx xx     =>     0  1 xx xx   m0
303
304    unpckhps m1, m0, m7
305    unpckhps m6, m3, m4
306    movhlps  m2, m6, m1
307    movlhps  m1, m1, m6
308
309    unpcklps m5, m5, m4
310    unpcklps m3, m3, m7
311    movhlps  m4, m3, m5
312    movlhps  m5, m5, m3
313    SWAP m4, m3
314    ; permutation done
315
    ; First half: out = value * win + buf.
    ; Outputs 16 and 17 (byte offsets 64*SBLIMIT and 68*SBLIMIT); only the
    ; two low lanes are stored.
316    PSHUFD  m6, m2, 0xb1
317    movss   m4, [bufq + 4*68]
318    movss   m7, [bufq + 4*64]
319    unpcklps  m7, m7, m4
320    mulps   m6, m6, [winq + 16*4]
321    addps   m6, m6, m7
322    movss   [outq + 64*SBLIMIT], m6
323    shufps  m6, m6, m6, 0xb1
324    movss   [outq + 68*SBLIMIT], m6
325
    ; Outputs 4..7
326    mulps   m6, m3, [winq + 4*4]
327    LOAD    m4, m7, bufq + 4*16, 16
328    addps   m6, m6, m4
329    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
330
    ; Outputs 8..11
331    shufps  m4, m0, m3, 0xb5
332    mulps   m4, m4, [winq + 8*4]
333    LOAD    m7, m6, bufq + 4*32, 16
334    addps   m4, m4, m7
335    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
336
    ; Outputs 12..15
337    shufps  m3, m3, m2, 0xb1
338    mulps   m3, m3, [winq + 12*4]
339    LOAD    m7, m6, bufq + 4*48, 16
340    addps   m3, m3, m7
341    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
342
    ; Outputs 0..3
343    mulps   m2, m2, [winq]
344    LOAD    m6, m7, bufq, 16
345    addps   m2, m2, m6
346    STORE   m2, m7, outq, 4*SBLIMIT
347
    ; Second half: value * win (float index 20 onwards) is written back to
    ; buf at the same every-4th-float positions that were read above.
    ; NOTE(review): presumably the overlap state consumed by the next call -
    ; confirm against the C caller.
348    mulps    m4, m1, [winq + 20*4]
349    STORE    m4, m7, bufq, 16
350
351    mulps    m3, m5, [winq + 24*4]
352    STORE    m3, m7, bufq + 4*16, 16
353
354    shufps   m0, m0, m5, 0xb0
355    mulps    m0, m0, [winq + 28*4]
356    STORE    m0, m7, bufq + 4*32, 16
357
358    shufps   m5, m5, m1, 0xb1
359    mulps    m5, m5, [winq + 32*4]
360    STORE    m5, m7, bufq + 4*48, 16
361
    ; Last two values: only the two low lanes are stored.
362    shufps   m1, m1, m1, 0xb1
363    mulps    m1, m1, [winq + 36*4]
364    movss    [bufq + 4*64], m1
365    shufps   m1, m1, 0xb1
366    movss    [bufq + 4*68], m1
367    RET
368%endmacro
369
; Instantiate imdct36_float once per instruction set. INIT_XMM sets the
; cpuflags that the macros above test, so each expansion picks its own
; instruction choices. The plain-SSE version is only built for 32-bit x86,
; and the AVX version only when HAVE_AVX_EXTERNAL is set.
370%if ARCH_X86_32
371INIT_XMM sse
372DEFINE_IMDCT
373%endif
374
375INIT_XMM sse2
376DEFINE_IMDCT
377
378INIT_XMM sse3
379DEFINE_IMDCT
380
381INIT_XMM ssse3
382DEFINE_IMDCT
383
384%if HAVE_AVX_EXTERNAL
385INIT_XMM avx
386DEFINE_IMDCT
387%endif
388
; Spill helpers for four_imdct36_float. Spill slots are numbered 8..15.
;   SPILL   reg#, slot - park register m<reg#> in the slot
;   UNSPILL reg#, slot - bring the slot's value back into m<reg#>
;   SPILLED(slot)      - operand that names the slot's current contents
; x86-64: a slot is simply register m<slot>; SPILL/UNSPILL are x86inc SWAPs
;         (register renaming).
; x86-32: a slot is 16 bytes of the tmp buffer at tmpq + 32*4 + (slot-8)*16,
;         moved with movaps, so that address must be 16-byte aligned.
389INIT_XMM sse
390
391%if ARCH_X86_64
392%define SPILL SWAP
393%define UNSPILL SWAP
394%define SPILLED(x) m %+ x
395%else
396%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
397%macro SPILL 2 ; xmm#, mempos
398    movaps SPILLED(%2), m%1
399%endmacro
400%macro UNSPILL 2
401    movaps m%1, SPILLED(%2)
402%endmacro
403%endif
404
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float: the 36-point IMDCT computed for four input
; blocks at once, one block per SIMD lane.
; cglobal arguments (named out, buf, in, win, tmp):
;   in  - four blocks of 18 floats, 72 bytes apart. Blocks 0 and 2 are read
;         with mova (aligned), blocks 1 and 3 with movu, since 72 is not a
;         multiple of 16.
;   out - one 16-byte vector (one float per block) per output index i,
;         stored with mova at byte offset 128*i, i = 0..17
;   buf - 16-byte vectors at float index 4*i: added into out, then
;         overwritten with new values
;   win - 16-byte vectors at float index 4*i for the out values; the values
;         written back to buf use the entries 80 floats further on
;   tmp - scratch: 16-byte slots at float offsets 0, 4, ..., 28, plus the
;         SPILLED() slots behind them on x86-32
; Constants come from costabs (one broadcast vector per entry).
; NOTE(review): the additions interleaved with the loads appear to be the
; same in[i] += in[i-1] / in[i] += in[i-2] pre-additions that
; imdct36_float performs - confirm against the C reference.
405%macro DEFINE_FOUR_IMDCT 0
406cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; in[16..17] of each block: after the shuffles m0 = in[16] of blocks
    ; 0..3 and m5 = in[17] of blocks 0..3.
407    movlps  m0, [inq+64]
408    movhps  m0, [inq+64 +   72]
409    movlps  m3, [inq+64 + 2*72]
410    movhps  m3, [inq+64 + 3*72]
411
412    shufps  m5, m0, m3, 0xdd
413    shufps  m0, m0, m3, 0x88
414
    ; in[12..15] of the four blocks. TRANSPOSE4x4PS comes from x86util;
    ; presumably it transposes the four listed registers (last argument is
    ; scratch) so that each register holds one input index for all blocks.
415    mova     m1, [inq+48]
416    movu     m6, [inq+48 +   72]
417    mova     m7, [inq+48 + 2*72]
418    movu     m3, [inq+48 + 3*72]
419
420    TRANSPOSE4x4PS 1, 6, 7, 3, 4
421
422    addps   m4, m6, m7
423    mova    [tmpq+4*28], m4
424
425    addps    m7, m3
426    addps    m6, m1
427    addps    m3, m0
428    addps    m0, m5
429    addps    m0, m7
430    addps    m7, m6
431    mova    [tmpq+4*12], m7
432    SPILL   3, 12
433
    ; in[8..11] of the four blocks
434    mova     m4, [inq+32]
435    movu     m5, [inq+32 +   72]
436    mova     m2, [inq+32 + 2*72]
437    movu     m7, [inq+32 + 3*72]
438
439    TRANSPOSE4x4PS 4, 5, 2, 7, 3
440
441    addps   m1, m7
442    SPILL   1, 11
443
444    addps   m3, m5, m2
445    SPILL   3, 13
446
447    addps    m7, m2
448    addps    m5, m4
449    addps    m6, m7
450    mova    [tmpq], m6
451    addps   m7, m5
452    mova    [tmpq+4*16], m7
453
    ; in[4..7] of the four blocks
454    mova    m2, [inq+16]
455    movu    m7, [inq+16 +   72]
456    mova    m1, [inq+16 + 2*72]
457    movu    m6, [inq+16 + 3*72]
458
459    TRANSPOSE4x4PS 2, 7, 1, 6, 3
460
461    addps   m4, m6
462    addps   m6, m1
463    addps   m1, m7
464    addps   m7, m2
465    addps   m5, m6
466    SPILL   5, 15
467    addps   m6, m7
468    mulps   m6, [costabs + 16*2]
469    mova    [tmpq+4*8], m6
470    SPILL   1, 10
471    SPILL   0, 14
472
    ; in[0..3] of the four blocks
473    mova    m1, [inq]
474    movu    m6, [inq +   72]
475    mova    m3, [inq + 2*72]
476    movu    m5, [inq + 3*72]
477
478    TRANSPOSE4x4PS 1, 6, 3, 5, 0
479
480    addps    m2, m5
481    addps    m5, m3
482    addps    m7, m5
483    addps    m3, m6
484    addps    m6, m1
485    SPILL    7, 8
486    addps    m5, m6
487    SPILL    6, 9
    ; Intermediate terms: sums and differences scaled by costabs entries
    ; 0..8. Results that are needed later are parked in tmp slots or
    ; SPILLED() slots.
488    addps    m6, m4, SPILLED(12)
489    subps    m6, m2
490    UNSPILL  7, 11
491    SPILL    5, 11
492    subps    m5, m1, m7
493    mulps    m7, [costabs + 16*5]
494    addps    m7, m1
495    mulps    m0, m6, [costabs + 16*6]
496    addps    m0, m5
497    mova     [tmpq+4*24], m0
498    addps    m6, m5
499    mova     [tmpq+4*4], m6
500    addps    m6, m4, m2
501    mulps    m6, [costabs + 16*1]
502    subps    m4, SPILLED(12)
503    mulps    m4, [costabs + 16*8]
504    addps    m2, SPILLED(12)
505    mulps    m2, [costabs + 16*3]
506    subps    m5, m7, m6
507    subps    m5, m2
508    addps    m6, m7
509    addps    m6, m4
510    addps    m7, m2
511    subps    m7, m4
512    mova     [tmpq+4*20], m7
513    mova     m2, [tmpq+4*28]
514    mova     [tmpq+4*28], m5
515    UNSPILL  7, 13
516    subps    m5, m7, m2
517    mulps    m5, [costabs + 16*7]
518    UNSPILL  1, 10
519    mulps    m1, [costabs + 16*2]
520    addps    m4, m3, m2
521    mulps    m4, [costabs + 16*4]
522    addps    m2, m7
523    addps    m7, m3
524    mulps    m7, [costabs]
525    subps    m3, m2
526    mulps    m3, [costabs + 16*2]
527    addps    m2, m7, m5
528    addps    m2, m1
529    SPILL    2, 10
530    addps    m7, m4
531    subps    m7, m1
532    SPILL    7, 12
533    subps    m5, m4
534    subps    m5, m1
535    UNSPILL  0, 14
536    SPILL    5, 13
537    addps    m1, m0, SPILLED(15)
538    subps    m1, SPILLED(8)
539    mova     m4, [costabs + 16*5]
540    mulps    m4, [tmpq]
541    UNSPILL  2, 9
542    addps    m4, m2
543    subps    m2, [tmpq]
544    mulps    m5, m1, [costabs + 16*6]
545    addps    m5, m2
546    SPILL    5, 9
547    addps    m2, m1
548    SPILL    2, 14
549    UNSPILL  5, 15
550    subps    m7, m5, m0
551    addps    m5, SPILLED(8)
552    mulps    m5, [costabs + 16*1]
553    mulps    m7, [costabs + 16*8]
554    addps    m0, SPILLED(8)
555    mulps    m0, [costabs + 16*3]
556    subps    m2, m4, m5
557    subps    m2, m0
558    SPILL    2, 15
559    addps    m5, m4
560    addps    m5, m7
561    addps    m4, m0
562    subps    m4, m7
563    SPILL    4, 8
564    mova     m7, [tmpq+4*16]
565    mova     m2, [tmpq+4*12]
566    addps    m0, m7, m2
567    subps    m0, SPILLED(11)
568    mulps    m0, [costabs + 16*2]
569    addps    m4, m7, SPILLED(11)
570    mulps    m4, [costabs]
571    subps    m7, m2
572    mulps    m7, [costabs + 16*7]
573    addps    m2, SPILLED(11)
574    mulps    m2, [costabs + 16*4]
575    addps    m1, m7, [tmpq+4*8]
576    addps    m1, m4
577    addps    m4, m2
578    subps    m4, [tmpq+4*8]
579    SPILL    4, 11
580    subps    m7, m2
581    subps    m7, [tmpq+4*8]
    ; Output stage. The same pattern repeats for each pair of terms: scale
    ; by a costabs entry from 9..17, form sum and difference, then
    ;   out[i] = difference * win[i]    + buf[i]   (two output indices)
    ;   buf[i] = sum        * win[i+80]            (the same two buf slots)
    ; where win/buf are indexed in floats (4*i in the operands) and out at
    ; byte offset 128 * (i/4).
582    addps    m4, m6, SPILLED(10)
583    subps    m6, SPILLED(10)
584    addps    m2, m5, m1
585    mulps    m2, [costabs + 16*9]
586    subps    m5, m1
587    mulps    m5, [costabs + 16*17]
588    subps    m1, m4, m2
589    addps    m4, m2
590    mulps    m2, m1, [winq+4*36]
591    addps    m2, [bufq+4*36]
592    mova     [outq+1152], m2
593    mulps    m1, [winq+4*32]
594    addps    m1, [bufq+4*32]
595    mova     [outq+1024], m1
596    mulps    m1, m4, [winq+4*116]
597    mova     [bufq+4*36], m1
598    mulps    m4, [winq+4*112]
599    mova     [bufq+4*32], m4
600    addps    m2, m6, m5
601    subps    m6, m5
602    mulps    m1, m6, [winq+4*68]
603    addps    m1, [bufq+4*68]
604    mova     [outq+2176], m1
605    mulps    m6, [winq]
606    addps    m6, [bufq]
607    mova     [outq], m6
608    mulps    m1, m2, [winq+4*148]
609    mova     [bufq+4*68], m1
610    mulps    m2, [winq+4*80]
611    mova     [bufq], m2
612    addps    m5, m3, [tmpq+4*24]
613    mova     m2, [tmpq+4*24]
614    subps    m2, m3
615    mova     m1, SPILLED(9)
616    subps    m1, m0
617    mulps    m1, [costabs + 16*10]
618    addps    m0, SPILLED(9)
619    mulps    m0, [costabs + 16*16]
620    addps    m6, m5, m1
621    subps    m5, m1
622    mulps    m3, m5, [winq+4*40]
623    addps    m3, [bufq+4*40]
624    mova     [outq+1280], m3
625    mulps    m5, [winq+4*28]
626    addps    m5, [bufq+4*28]
627    mova     [outq+896], m5
628    mulps    m1, m6, [winq+4*120]
629    mova     [bufq+4*40], m1
630    mulps    m6, [winq+4*108]
631    mova     [bufq+4*28], m6
632    addps    m1, m2, m0
633    subps    m2, m0
634    mulps    m5, m2, [winq+4*64]
635    addps    m5, [bufq+4*64]
636    mova     [outq+2048], m5
637    mulps    m2, [winq+4*4]
638    addps    m2, [bufq+4*4]
639    mova     [outq+128], m2
640    mulps    m0, m1, [winq+4*144]
641    mova     [bufq+4*64], m0
642    mulps    m1, [winq+4*84]
643    mova     [bufq+4*4], m1
644    mova     m1, [tmpq+4*28]
645    mova     m5, m1
646    addps    m1, SPILLED(13)
647    subps    m5, SPILLED(13)
648    UNSPILL  3, 15
649    addps    m2, m7, m3
650    mulps    m2, [costabs + 16*11]
651    subps    m3, m7
652    mulps    m3, [costabs + 16*15]
653    addps    m0, m2, m1
654    subps    m1, m2
    ; rename only: the sum just computed into m0 is called m2 from here on
655    SWAP     m0, m2
656    mulps    m6, m1, [winq+4*44]
657    addps    m6, [bufq+4*44]
658    mova     [outq+1408], m6
659    mulps    m1, [winq+4*24]
660    addps    m1, [bufq+4*24]
661    mova     [outq+768], m1
662    mulps    m0, m2, [winq+4*124]
663    mova     [bufq+4*44], m0
664    mulps    m2, [winq+4*104]
665    mova     [bufq+4*24], m2
666    addps    m0, m5, m3
667    subps    m5, m3
668    mulps    m1, m5, [winq+4*60]
669    addps    m1, [bufq+4*60]
670    mova     [outq+1920], m1
671    mulps    m5, [winq+4*8]
672    addps    m5, [bufq+4*8]
673    mova     [outq+256], m5
674    mulps    m1, m0, [winq+4*140]
675    mova     [bufq+4*60], m1
676    mulps    m0, [winq+4*88]
677    mova     [bufq+4*8], m0
678    mova     m1, [tmpq+4*20]
679    addps    m1, SPILLED(12)
680    mova     m2, [tmpq+4*20]
681    subps    m2, SPILLED(12)
682    UNSPILL  7, 8
683    subps    m0, m7, SPILLED(11)
684    addps    m7, SPILLED(11)
685    mulps    m4, m7, [costabs + 16*12]
686    mulps    m0, [costabs + 16*14]
687    addps    m5, m1, m4
688    subps    m1, m4
689    mulps    m7, m1, [winq+4*48]
690    addps    m7, [bufq+4*48]
691    mova     [outq+1536], m7
692    mulps    m1, [winq+4*20]
693    addps    m1, [bufq+4*20]
694    mova     [outq+640], m1
695    mulps    m1, m5, [winq+4*128]
696    mova     [bufq+4*48], m1
697    mulps    m5, [winq+4*100]
698    mova     [bufq+4*20], m5
699    addps    m6, m2, m0
700    subps    m2, m0
701    mulps    m1, m2, [winq+4*56]
702    addps    m1, [bufq+4*56]
703    mova     [outq+1792], m1
704    mulps    m2, [winq+4*12]
705    addps    m2, [bufq+4*12]
706    mova     [outq+384], m2
707    mulps    m0, m6, [winq+4*136]
708    mova    [bufq+4*56], m0
709    mulps    m6, [winq+4*92]
710    mova     [bufq+4*12], m6
    ; Last pair: a single term scaled by costabs entry 13, combined with
    ; the value saved in tmp slot 4.
711    UNSPILL  0, 14
712    mulps    m0, [costabs + 16*13]
713    mova     m3, [tmpq+4*4]
714    addps    m2, m0, m3
715    subps    m3, m0
716    mulps    m0, m3, [winq+4*52]
717    addps    m0, [bufq+4*52]
718    mova     [outq+1664], m0
719    mulps    m3, [winq+4*16]
720    addps    m3, [bufq+4*16]
721    mova     [outq+512], m3
722    mulps    m0, m2, [winq+4*132]
723    mova     [bufq+4*52], m0
724    mulps    m2, [winq+4*96]
725    mova     [bufq+4*16], m2
726    RET
727%endmacro
728
; Instantiate four_imdct36_float for plain SSE and, when HAVE_AVX_EXTERNAL
; is set, for AVX. The macro body contains no cpuflag tests of its own, so
; the two versions differ only in how x86inc encodes the instructions.
729INIT_XMM sse
730DEFINE_FOUR_IMDCT
731
732%if HAVE_AVX_EXTERNAL
733INIT_XMM avx
734DEFINE_FOUR_IMDCT
735%endif
736