1;******************************************************************************
2;* 36 point SSE-optimized IMDCT transform
3;* Copyright (c) 2011 Vitor Sessak
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
; Read-only constants. Every table below is a multiple of 16 bytes long and is
; used directly as a 16-byte memory operand, so the section start has to be
; 16-byte aligned (NOTE(review): relies on SECTION_RODATA from x86inc providing
; that alignment - confirm).
24SECTION_RODATA
25
; Lane masks for andps. Dwords are listed from lane 0 (lowest) upwards;
; ~0 keeps a lane, 0 clears it.
26ps_mask:  dd 0, ~0, ~0, ~0
27ps_mask2: dd 0, ~0,  0, ~0
28ps_mask3: dd 0,  0,  0, ~0
29ps_mask4: dd 0, ~0,  0,  0
30
; Per-lane multipliers used by imdct36_float while building its intermediate
; vector. The magnitudes are 0.5, 1.0 and sines/cosines of multiples of 10
; degrees: 0.1736481777 = sin(10), 0.3420201433 = sin(20),
; 0.6427876097 = sin(40), 0.7660444431 = cos(40), 0.8660254038 = cos(30),
; 0.9396926208 = cos(20), 0.9848077530 = cos(10).
31ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
32ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
33ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
34ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
35ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
36ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
37ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097
38
; Sign-bit masks for xorps: p = lane unchanged, m = lane negated.
39ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
40ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000
41
; Scale factors for BUTTERF/BUTTERF2, one 16-byte row per call (the macro's
; third argument is the byte offset of the row). The non-unit magnitudes equal
; 0.5 / cos(pi * (2*i + 1) / 36): row j holds the value for i = j in lane 1
; and the value for i = 8 - j in lane 3.
42ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
43               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
44               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
45               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
46               dd 1.0, 0.70710678118654752439,  0.0,  0.0
47
; Same table with lanes 1 and 3 negated. The SSE3 code path finishes with
; addsubps (subtract in even lanes, add in odd lanes) instead of xorps+addps,
; and the flipped signs make both paths produce the same result.
48ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
49               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
50               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
51               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
52               dd 1.0, -0.70710678118654752439,  0.0,  0.0
53
; Constants for four_imdct36_float, each replicated across all four lanes and
; addressed as [costabs + 16*k]. Entries 0..8 are signed sines/cosines of
; multiples of 10 degrees; entries 9..17 are 0.5 / cos(pi * (2*i + 1) / 36)
; for i = 0..8.
54costabs:  times 4 dd  0.98480773
55          times 4 dd  0.93969262
56          times 4 dd  0.86602539
57          times 4 dd -0.76604444
58          times 4 dd -0.64278764
59          times 4 dd  0.50000000
60          times 4 dd -0.50000000
61          times 4 dd -0.34202015
62          times 4 dd -0.17364818
63          times 4 dd  0.50190992
64          times 4 dd  0.51763808
65          times 4 dd  0.55168896
66          times 4 dd  0.61038726
67          times 4 dd  0.70710677
68          times 4 dd  0.87172341
69          times 4 dd  1.18310082
70          times 4 dd  1.93185163
71          times 4 dd  5.73685646
72
; Output stride factor: imdct36_float stores consecutive results
; 4*SBLIMIT bytes (SBLIMIT floats) apart in its output buffer.
73%define SBLIMIT 32
74SECTION .text
75
; PSHUFD dst, src, imm8
; Permute the four dwords of src into dst as selected by imm8.
; Targets without SSE2, and AVX targets, use shufps with src supplied as both
; inputs, which gives the same permutation; the remaining targets
; (SSE2 up to, but not including, AVX) use the integer pshufd.
%macro PSHUFD 3
%if notcpuflag(sse2) || cpuflag(avx)
    shufps %1, %2, %2, %3
%else
    pshufd %1, %2, %3
%endif
%endmacro
83
; BUILDINVHIGHLOW dst, x, y
;   in:  %2 = {x1,x2,x3,x4}, %3 = {y1,y2,y3,y4}
;   out: %1 = {x3,x4,y1,y2}   (high half of x followed by low half of y)
; Without AVX the result is assembled in two steps (upper half first, then
; lower half); with AVX a single three-operand shufps does the job.
%macro BUILDINVHIGHLOW 3
%if notcpuflag(avx)
    movlhps %1, %3
    movhlps %1, %2
%else
    shufps %1, %2, %3, 0x4e
%endif
%endmacro
94
; ROTLEFT dst, x, y
;   in:  %2 = {x1,x2,x3,x4}, %3 = {y1,y2,y3,y4}
;   out: %1 = {x4,y1,y2,y3}   (the pair y:x shifted down by three floats)
; Pre-SSSE3 targets build {x3,x4,y1,y2} first and then pick lanes 1,2 of it
; and lanes 1,2 of y; SSSE3 and later use one palignr by 12 bytes.
%macro ROTLEFT 3
%if notcpuflag(ssse3)
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%else
    palignr  %1, %3, %2, 12
%endif
%endmacro
105
; INVERTHL dst, src
; dst = src with its 64-bit halves exchanged: {s2,s3,s0,s1}.
; Plain SSE moves each half separately; SSE2 and later go through PSHUFD.
%macro INVERTHL 2
%if notcpuflag(sse2)
    movhlps %1, %2
    movlhps %1, %2
%else
    PSHUFD  %1, %2, 0x4e
%endif
%endmacro
114
; BUTTERF v, tmp, ofs
;   %1 = vector {a0,a1,a2,a3}, updated in place
;   %2 = scratch register, clobbered
;   %3 = byte offset of the coefficient row inside ps_cosh / ps_cosh_sse3
;
; Stage 1: %1 = {a0+a2, a1+a3, a0-a2, a1-a3}
; Stage 2: scale by the coefficient row, giving {v0,v1,v2,v3}
; Stage 3: %1 = {v0+v1, v0-v1, v2+v3, v2-v3}
; (v1/v3 above are the values scaled by the ps_cosh row; the SSE3 path uses
; the sign-flipped ps_cosh_sse3 row so that addsubps gives the same result.)
115%macro BUTTERF 3
116    INVERTHL %2, %1
117    xorps    %1, [ps_p1p1m1m1]
118    addps    %1, %2
119%if cpuflag(sse3)
120    mulps    %1, %1, [ps_cosh_sse3 + %3]
121    PSHUFD   %2, %1, 0xb1
122    addsubps %1, %1, %2
123%else
124    mulps    %1, [ps_cosh + %3]
125    PSHUFD   %2, %1, 0xb1
126    xorps    %1, [ps_p1m1p1m1]
127    addps    %1, %2
128%endif
129%endmacro
130
; BUTTERF2 v, tmp, ofs
;   %1 = vector, updated in place
;   %2 = scratch register, clobbered
;   %3 = byte offset of the coefficient row inside ps_cosh / ps_cosh_sse3
;
; Variant of BUTTERF without the first add/sub stage, and with shuffle 0xe1,
; which swaps only lanes 0 and 1. After scaling to {v0,v1,v2,v3} the low
; lanes become {v0+v1, v0-v1}. Its only use here is with row offset 64, whose
; lane 2/3 coefficients are 0.0, so the upper lanes carry no data.
131%macro BUTTERF2 3
132%if cpuflag(sse3)
133    mulps    %1, %1, [ps_cosh_sse3 + %3]
134    PSHUFD   %2, %1, 0xe1
135    addsubps %1, %1, %2
136%else
137    mulps    %1, [ps_cosh + %3]
138    PSHUFD   %2, %1, 0xe1
139    xorps    %1, [ps_p1m1p1m1]
140    addps    %1, %2
141%endif
142%endmacro
143
; STORE v, tmp, base, stride
; Scatter the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4].
;   %1 = source vector; the non-SSE4 path shuffles it in place (clobbered)
;   %2 = scratch register, clobbered on the non-SSE4 path
;   %3 = base address expression
;   %4 = byte stride between consecutive elements
144%macro STORE 4
145%if cpuflag(sse4)
146    movss     [%3       ], %1
147    extractps dword [%3 +   %4], %1, 1
148    extractps dword [%3 + 2*%4], %1, 2
149    extractps dword [%3 + 3*%4], %1, 3
150%else
; lanes 0 and 2 first, then swap adjacent lanes to reach lanes 1 and 3
151    movhlps %2, %1
152    movss   [%3       ], %1
153    movss   [%3 + 2*%4], %2
154    shufps  %1, %1, 0xb1
155    movss   [%3 +   %4], %1
156    movhlps %2, %1
157    movss   [%3 + 3*%4], %2
158%endif
159%endmacro
160
; LOAD dst, tmp, base, stride
; Gather the floats at [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4] into %1.
;   %1 = destination vector
;   %2 = scratch register, clobbered
;   %3 = base address expression
;   %4 = byte stride between consecutive elements
; Each movlps/movhps reads 8 bytes, i.e. one extra float after every wanted
; element; the final shufps (0x88) keeps only the first float of each pair.
161%macro LOAD 4
162    movlps  %1, [%3       ]
163    movhps  %1, [%3 +   %4]
164    movlps  %2, [%3 + 2*%4]
165    movhps  %2, [%3 + 3*%4]
166    shufps  %1, %2, 0x88
167%endmacro
168
; LOADA64 dst, addr
; Load 16 bytes from [addr] into dst. Non-AVX targets fetch the data as two
; 8-byte halves; AVX targets issue a single unaligned 16-byte load.
%macro LOADA64 2
%if notcpuflag(avx)
   movlps   %1, [%2]
   movhps   %1, [%2 + 8]
%else
   movu     %1, [%2]
%endif
%endmacro
177
;------------------------------------------------------------------------------
; DEFINE_IMDCT: emits imdct36_float for the instruction set selected by the
; preceding INIT_XMM.
;
; Arguments (cglobal: 4 arguments, 4 GPRs, 9 XMM registers; m8 is only touched
; when ARCH_X86_64 is set):
;   out - results are stored at byte offsets k*4*SBLIMIT, k = 0..17
;   buf - the floats at byte offsets k*16, k = 0..17, are added into the
;         output and then overwritten with the second windowed half
;   in  - 18 consecutive floats (72 bytes); only read
;   win - window; floats 0..19 are read for the part written to out and
;         floats 20..39 for the part written back to buf. It is used as a
;         memory operand of mulps, so it has to be 16-byte aligned on the
;         non-AVX builds.
;------------------------------------------------------------------------------
178%macro DEFINE_IMDCT 0
179cglobal imdct36_float, 4,4,9, out, buf, in, win
180
181    ; for(i=17;i>=1;i--) in[i] += in[i-1];
; m0..m3 = in[0..15]; m4 = {in[16], in[17], 0, 0}. Each vector gets a copy of
; the input shifted by one element added to it (ps_mask clears the lane that
; has no predecessor).
182    LOADA64 m0, inq
183    LOADA64 m1, inq + 16
184
185    ROTLEFT m5, m0, m1
186
187    PSHUFD  m6, m0, 0x93
188    andps   m6, m6, [ps_mask]
189    addps   m0, m0, m6
190
191    LOADA64 m2, inq + 32
192
193    ROTLEFT m7, m1, m2
194
195    addps   m1, m1, m5
196    LOADA64 m3, inq + 48
197
198    ROTLEFT m5, m2, m3
199
200    xorps   m4, m4, m4
201    movlps  m4, [inq+64]
202    BUILDINVHIGHLOW m6, m3, m4
203    shufps  m6, m6, m4, 0xa9
204
205    addps   m4, m4, m6
206    addps   m2, m2, m7
207    addps   m3, m3, m5
208
209    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
; Only odd lanes are updated here; ps_mask2/3/4 zero the lanes that must stay
; unchanged before the shifted copies are added.
210    movlhps m5, m5, m0
211    andps   m5, m5, [ps_mask3]
212
213    BUILDINVHIGHLOW m7, m0, m1
214    andps   m7, m7, [ps_mask2]
215
216    addps   m0, m0, m5
217
218    BUILDINVHIGHLOW m6, m1, m2
219    andps   m6, m6, [ps_mask2]
220
221    addps  m1, m1, m7
222
223    BUILDINVHIGHLOW m7, m2, m3
224    andps   m7, m7, [ps_mask2]
225
226    addps   m2, m2, m6
227
228    movhlps m6, m6, m3
229    andps   m6, m6, [ps_mask4]
230
231    addps  m3, m3, m7
232    addps  m4, m4, m6
233
234    ; Populate tmp[]
235    movlhps m6, m1, m5    ; zero out high values
236    subps   m6, m6, m4
237
238    subps  m5, m0, m3
239
; With 16 XMM registers available, m0 - m3 is parked in m8 (SWAP only renames
; registers); otherwise it is recomputed further down.
240%if ARCH_X86_64
241    SWAP   m5, m8
242%endif
243
244    mulps  m7, m2, [ps_val1]
245
246%if ARCH_X86_64
247    mulps  m5, m8, [ps_val2]
248%else
249    mulps  m5, m5, [ps_val2]
250%endif
251    addps  m7, m7, m5
252
253    mulps  m5, m6, [ps_val1]
254    subps  m7, m7, m5
255
256%if ARCH_X86_64
257    SWAP   m5, m8
258%else
259    subps  m5, m0, m3
260%endif
261
262    subps  m5, m5, m6
263    addps  m5, m5, m2
264
265    shufps m6, m4, m3, 0xe4
266    subps  m6, m6, m2
267    mulps  m6, m6, [ps_val3]
268
269    addps  m4, m4, m1
270    mulps  m4, m4, [ps_val4]
271
272    shufps m1, m1, m0, 0xe4
273    addps  m1, m1, m2
274    mulps  m1, m1, [ps_val5]
275
276    mulps  m3, m3, [ps_val6]
277    mulps  m0, m0, [ps_val7]
278    addps  m0, m0, m3
279
280    xorps  m2, m1, [ps_p1p1m1m1]
281    subps  m2, m2, m4
282    addps  m2, m2, m0
283
284    addps  m3, m4, m0
285    subps  m3, m3, m6
286    xorps  m3, m3, [ps_p1p1m1m1]
287
288    shufps m0, m0, m4, 0xe4
289    subps  m0, m0, m1
290    addps  m0, m0, m6
291
292    BUILDINVHIGHLOW m4, m2, m3
293    shufps  m3, m3, m2, 0x4e
294
295    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
296
; Final butterflies; the third argument is the byte offset of the coefficient
; row in ps_cosh / ps_cosh_sse3. The second argument is scratch.
297    BUTTERF  m0, m1, 0
298    BUTTERF  m7, m2, 16
299    BUTTERF  m3, m6, 32
300    BUTTERF  m4, m1, 48
301    BUTTERF2 m5, m1, 64
302
303    ; permutation:
304    ; m0    0  1  2  3     =>     2  6 10 14   m1
305    ; m7    4  5  6  7     =>     3  7 11 15   m2
306    ; m3    8  9 10 11     =>    17 13  9  5   m3
307    ; m4   12 13 14 15     =>    16 12  8  4   m5
308    ; m5   16 17 xx xx     =>     0  1 xx xx   m0
309
310    unpckhps m1, m0, m7
311    unpckhps m6, m3, m4
312    movhlps  m2, m6, m1
313    movlhps  m1, m1, m6
314
315    unpcklps m5, m5, m4
316    unpcklps m3, m3, m7
317    movhlps  m4, m3, m5
318    movlhps  m5, m5, m3
319    SWAP m4, m3
320    ; permutation done
321
; First half: out[k * SBLIMIT] = value * win[k] + buf[4 * k], k = 0..17.
; Outputs 16 and 17 are handled first with scalar stores, then 4..7, 8..11,
; 12..15 and 0..3 via LOAD/STORE.
322    PSHUFD  m6, m2, 0xb1
323    movss   m4, [bufq + 4*68]
324    movss   m7, [bufq + 4*64]
325    unpcklps  m7, m7, m4
326    mulps   m6, m6, [winq + 16*4]
327    addps   m6, m6, m7
328    movss   [outq + 64*SBLIMIT], m6
329    shufps  m6, m6, m6, 0xb1
330    movss   [outq + 68*SBLIMIT], m6
331
332    mulps   m6, m3, [winq + 4*4]
333    LOAD    m4, m7, bufq + 4*16, 16
334    addps   m6, m6, m4
335    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
336
337    shufps  m4, m0, m3, 0xb5
338    mulps   m4, m4, [winq + 8*4]
339    LOAD    m7, m6, bufq + 4*32, 16
340    addps   m4, m4, m7
341    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
342
343    shufps  m3, m3, m2, 0xb1
344    mulps   m3, m3, [winq + 12*4]
345    LOAD    m7, m6, bufq + 4*48, 16
346    addps   m3, m3, m7
347    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
348
349    mulps   m2, m2, [winq]
350    LOAD    m6, m7, bufq, 16
351    addps   m2, m2, m6
352    STORE   m2, m7, outq, 4*SBLIMIT
353
; Second half: buf[4 * k] = value * win[20 + k], k = 0..17, overwriting the
; buf entries that were consumed above.
354    mulps    m4, m1, [winq + 20*4]
355    STORE    m4, m7, bufq, 16
356
357    mulps    m3, m5, [winq + 24*4]
358    STORE    m3, m7, bufq + 4*16, 16
359
360    shufps   m0, m0, m5, 0xb0
361    mulps    m0, m0, [winq + 28*4]
362    STORE    m0, m7, bufq + 4*32, 16
363
364    shufps   m5, m5, m1, 0xb1
365    mulps    m5, m5, [winq + 32*4]
366    STORE    m5, m7, bufq + 4*48, 16
367
368    shufps   m1, m1, m1, 0xb1
369    mulps    m1, m1, [winq + 36*4]
370    movss    [bufq + 4*64], m1
371    shufps   m1, m1, 0xb1
372    movss    [bufq + 4*68], m1
373    RET
374%endmacro
375
; Instantiate imdct36_float once per instruction set. The cpuflag() tests
; inside the helper macros pick the matching code path for each expansion.
; The plain-SSE version is only built when ARCH_X86_32 is set, and the AVX
; version only when HAVE_AVX_EXTERNAL is set.
376%if ARCH_X86_32
377INIT_XMM sse
378DEFINE_IMDCT
379%endif
380
381INIT_XMM sse2
382DEFINE_IMDCT
383
384INIT_XMM sse3
385DEFINE_IMDCT
386
387INIT_XMM ssse3
388DEFINE_IMDCT
389
390%if HAVE_AVX_EXTERNAL
391INIT_XMM avx
392DEFINE_IMDCT
393%endif
394
; Register-spill helpers for four_imdct36_float, which juggles more than
; eight live vectors.
;   ARCH_X86_64: "spill slots" 8..15 are simply registers m8..m15, and
;                SPILL/UNSPILL are SWAP, i.e. a register renaming that emits
;                no instructions.
;   otherwise:   slot x lives in the tmp buffer at byte offset
;                (x-8)*16 + 32*4, i.e. right after the first 128 bytes of tmp,
;                and SPILL/UNSPILL are aligned stores/loads.
395INIT_XMM sse
396
397%if ARCH_X86_64
398%define SPILL SWAP
399%define UNSPILL SWAP
400%define SPILLED(x) m %+ x
401%else
402%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
403%macro SPILL 2 ; xmm#, mempos
404    movaps SPILLED(%2), m%1
405%endmacro
; UNSPILL xmm#, mempos: reload register m<xmm#> from spill slot <mempos>
406%macro UNSPILL 2
407    movaps m%1, SPILLED(%2)
408%endmacro
409%endif
410
;------------------------------------------------------------------------------
; DEFINE_FOUR_IMDCT: emits four_imdct36_float, which processes four input
; blocks at once, one per SIMD lane.
;
; Arguments (cglobal: 5 arguments, 5 GPRs, 16 XMM registers requested):
;   in  - four input blocks 72 bytes (18 floats) apart; blocks 0 and 2 are
;         read with mova at offsets that are multiples of 16, so in has to be
;         16-byte aligned
;   out - one 16-byte row per output index i = 0..17 at byte offset 128*i
;   buf - one 16-byte row per index i at byte offset 16*i; it is added into
;         the output and then overwritten
;   win - 16-byte rows; row i scales the data written to out, row 20+i
;         (byte offset 4*80 + 16*i) scales the data written back to buf
;   tmp - 16-byte aligned scratch: bytes 0..127 are used directly; when
;         ARCH_X86_64 is not set, SPILLED(8)..SPILLED(15) occupy bytes
;         128..255 as well
;
; The instruction order is tied to the register/spill-slot allocation; SPILL
; and UNSPILL rename registers on x86-64, so do not reorder casually.
;------------------------------------------------------------------------------
411%macro DEFINE_FOUR_IMDCT 0
412cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
; Gather in[16] and in[17] of the four blocks:
; m0 = in[16] of blocks 0..3, m5 = in[17] of blocks 0..3.
413    movlps  m0, [inq+64]
414    movhps  m0, [inq+64 +   72]
415    movlps  m3, [inq+64 + 2*72]
416    movhps  m3, [inq+64 + 3*72]
417
418    shufps  m5, m0, m3, 0xdd
419    shufps  m0, m0, m3, 0x88
420
; Load in[12..15] of each block and transpose so that every register holds
; one input index across the four blocks (TRANSPOSE4x4PS comes from
; x86util.asm; its last argument is a scratch register).
421    mova     m1, [inq+48]
422    movu     m6, [inq+48 +   72]
423    mova     m7, [inq+48 + 2*72]
424    movu     m3, [inq+48 + 3*72]
425
426    TRANSPOSE4x4PS 1, 6, 7, 3, 4
427
428    addps   m4, m6, m7
429    mova    [tmpq+4*28], m4
430
431    addps    m7, m3
432    addps    m6, m1
433    addps    m3, m0
434    addps    m0, m5
435    addps    m0, m7
436    addps    m7, m6
437    mova    [tmpq+4*12], m7
438    SPILL   3, 12
439
; Same for in[8..11].
440    mova     m4, [inq+32]
441    movu     m5, [inq+32 +   72]
442    mova     m2, [inq+32 + 2*72]
443    movu     m7, [inq+32 + 3*72]
444
445    TRANSPOSE4x4PS 4, 5, 2, 7, 3
446
447    addps   m1, m7
448    SPILL   1, 11
449
450    addps   m3, m5, m2
451    SPILL   3, 13
452
453    addps    m7, m2
454    addps    m5, m4
455    addps    m6, m7
456    mova    [tmpq], m6
457    addps   m7, m5
458    mova    [tmpq+4*16], m7
459
; Same for in[4..7].
460    mova    m2, [inq+16]
461    movu    m7, [inq+16 +   72]
462    mova    m1, [inq+16 + 2*72]
463    movu    m6, [inq+16 + 3*72]
464
465    TRANSPOSE4x4PS 2, 7, 1, 6, 3
466
467    addps   m4, m6
468    addps   m6, m1
469    addps   m1, m7
470    addps   m7, m2
471    addps   m5, m6
472    SPILL   5, 15
473    addps   m6, m7
474    mulps   m6, [costabs + 16*2]
475    mova    [tmpq+4*8], m6
476    SPILL   1, 10
477    SPILL   0, 14
478
; Same for in[0..3].
479    mova    m1, [inq]
480    movu    m6, [inq +   72]
481    mova    m3, [inq + 2*72]
482    movu    m5, [inq + 3*72]
483
484    TRANSPOSE4x4PS 1, 6, 3, 5, 0
485
; From here on all arithmetic is lane-wise; constants come from costabs
; (addressed as [costabs + 16*k]) and intermediate results are kept in tmp
; and in the spill slots 8..15.
486    addps    m2, m5
487    addps    m5, m3
488    addps    m7, m5
489    addps    m3, m6
490    addps    m6, m1
491    SPILL    7, 8
492    addps    m5, m6
493    SPILL    6, 9
494    addps    m6, m4, SPILLED(12)
495    subps    m6, m2
496    UNSPILL  7, 11
497    SPILL    5, 11
498    subps    m5, m1, m7
499    mulps    m7, [costabs + 16*5]
500    addps    m7, m1
501    mulps    m0, m6, [costabs + 16*6]
502    addps    m0, m5
503    mova     [tmpq+4*24], m0
504    addps    m6, m5
505    mova     [tmpq+4*4], m6
506    addps    m6, m4, m2
507    mulps    m6, [costabs + 16*1]
508    subps    m4, SPILLED(12)
509    mulps    m4, [costabs + 16*8]
510    addps    m2, SPILLED(12)
511    mulps    m2, [costabs + 16*3]
512    subps    m5, m7, m6
513    subps    m5, m2
514    addps    m6, m7
515    addps    m6, m4
516    addps    m7, m2
517    subps    m7, m4
518    mova     [tmpq+4*20], m7
519    mova     m2, [tmpq+4*28]
520    mova     [tmpq+4*28], m5
521    UNSPILL  7, 13
522    subps    m5, m7, m2
523    mulps    m5, [costabs + 16*7]
524    UNSPILL  1, 10
525    mulps    m1, [costabs + 16*2]
526    addps    m4, m3, m2
527    mulps    m4, [costabs + 16*4]
528    addps    m2, m7
529    addps    m7, m3
530    mulps    m7, [costabs]
531    subps    m3, m2
532    mulps    m3, [costabs + 16*2]
533    addps    m2, m7, m5
534    addps    m2, m1
535    SPILL    2, 10
536    addps    m7, m4
537    subps    m7, m1
538    SPILL    7, 12
539    subps    m5, m4
540    subps    m5, m1
541    UNSPILL  0, 14
542    SPILL    5, 13
543    addps    m1, m0, SPILLED(15)
544    subps    m1, SPILLED(8)
545    mova     m4, [costabs + 16*5]
546    mulps    m4, [tmpq]
547    UNSPILL  2, 9
548    addps    m4, m2
549    subps    m2, [tmpq]
550    mulps    m5, m1, [costabs + 16*6]
551    addps    m5, m2
552    SPILL    5, 9
553    addps    m2, m1
554    SPILL    2, 14
555    UNSPILL  5, 15
556    subps    m7, m5, m0
557    addps    m5, SPILLED(8)
558    mulps    m5, [costabs + 16*1]
559    mulps    m7, [costabs + 16*8]
560    addps    m0, SPILLED(8)
561    mulps    m0, [costabs + 16*3]
562    subps    m2, m4, m5
563    subps    m2, m0
564    SPILL    2, 15
565    addps    m5, m4
566    addps    m5, m7
567    addps    m4, m0
568    subps    m4, m7
569    SPILL    4, 8
570    mova     m7, [tmpq+4*16]
571    mova     m2, [tmpq+4*12]
572    addps    m0, m7, m2
573    subps    m0, SPILLED(11)
574    mulps    m0, [costabs + 16*2]
575    addps    m4, m7, SPILLED(11)
576    mulps    m4, [costabs]
577    subps    m7, m2
578    mulps    m7, [costabs + 16*7]
579    addps    m2, SPILLED(11)
580    mulps    m2, [costabs + 16*4]
581    addps    m1, m7, [tmpq+4*8]
582    addps    m1, m4
583    addps    m4, m2
584    subps    m4, [tmpq+4*8]
585    SPILL    4, 11
586    subps    m7, m2
587    subps    m7, [tmpq+4*8]
; Output stage. The same pattern repeats for each pair (a, b) of
; intermediate vectors, where b has been scaled by one of costabs[9..17]:
;   out row i = (a - b) * win row i + buf row i      (two rows i per a - b)
;   buf row i = (a + b) * win row 20+i               (the same two rows i)
; Rows are 16 bytes apart in buf and win, and 128 bytes apart in out.
588    addps    m4, m6, SPILLED(10)
589    subps    m6, SPILLED(10)
590    addps    m2, m5, m1
591    mulps    m2, [costabs + 16*9]
592    subps    m5, m1
593    mulps    m5, [costabs + 16*17]
; rows 9 and 8
594    subps    m1, m4, m2
595    addps    m4, m2
596    mulps    m2, m1, [winq+4*36]
597    addps    m2, [bufq+4*36]
598    mova     [outq+1152], m2
599    mulps    m1, [winq+4*32]
600    addps    m1, [bufq+4*32]
601    mova     [outq+1024], m1
602    mulps    m1, m4, [winq+4*116]
603    mova     [bufq+4*36], m1
604    mulps    m4, [winq+4*112]
605    mova     [bufq+4*32], m4
; rows 17 and 0
606    addps    m2, m6, m5
607    subps    m6, m5
608    mulps    m1, m6, [winq+4*68]
609    addps    m1, [bufq+4*68]
610    mova     [outq+2176], m1
611    mulps    m6, [winq]
612    addps    m6, [bufq]
613    mova     [outq], m6
614    mulps    m1, m2, [winq+4*148]
615    mova     [bufq+4*68], m1
616    mulps    m2, [winq+4*80]
617    mova     [bufq], m2
618    addps    m5, m3, [tmpq+4*24]
619    mova     m2, [tmpq+4*24]
620    subps    m2, m3
621    mova     m1, SPILLED(9)
622    subps    m1, m0
623    mulps    m1, [costabs + 16*10]
624    addps    m0, SPILLED(9)
625    mulps    m0, [costabs + 16*16]
; rows 10 and 7
626    addps    m6, m5, m1
627    subps    m5, m1
628    mulps    m3, m5, [winq+4*40]
629    addps    m3, [bufq+4*40]
630    mova     [outq+1280], m3
631    mulps    m5, [winq+4*28]
632    addps    m5, [bufq+4*28]
633    mova     [outq+896], m5
634    mulps    m1, m6, [winq+4*120]
635    mova     [bufq+4*40], m1
636    mulps    m6, [winq+4*108]
637    mova     [bufq+4*28], m6
; rows 16 and 1
638    addps    m1, m2, m0
639    subps    m2, m0
640    mulps    m5, m2, [winq+4*64]
641    addps    m5, [bufq+4*64]
642    mova     [outq+2048], m5
643    mulps    m2, [winq+4*4]
644    addps    m2, [bufq+4*4]
645    mova     [outq+128], m2
646    mulps    m0, m1, [winq+4*144]
647    mova     [bufq+4*64], m0
648    mulps    m1, [winq+4*84]
649    mova     [bufq+4*4], m1
650    mova     m1, [tmpq+4*28]
651    mova     m5, m1
652    addps    m1, SPILLED(13)
653    subps    m5, SPILLED(13)
654    UNSPILL  3, 15
655    addps    m2, m7, m3
656    mulps    m2, [costabs + 16*11]
657    subps    m3, m7
658    mulps    m3, [costabs + 16*15]
; rows 11 and 6
659    addps    m0, m2, m1
660    subps    m1, m2
661    SWAP     m0, m2
662    mulps    m6, m1, [winq+4*44]
663    addps    m6, [bufq+4*44]
664    mova     [outq+1408], m6
665    mulps    m1, [winq+4*24]
666    addps    m1, [bufq+4*24]
667    mova     [outq+768], m1
668    mulps    m0, m2, [winq+4*124]
669    mova     [bufq+4*44], m0
670    mulps    m2, [winq+4*104]
671    mova     [bufq+4*24], m2
; rows 15 and 2
672    addps    m0, m5, m3
673    subps    m5, m3
674    mulps    m1, m5, [winq+4*60]
675    addps    m1, [bufq+4*60]
676    mova     [outq+1920], m1
677    mulps    m5, [winq+4*8]
678    addps    m5, [bufq+4*8]
679    mova     [outq+256], m5
680    mulps    m1, m0, [winq+4*140]
681    mova     [bufq+4*60], m1
682    mulps    m0, [winq+4*88]
683    mova     [bufq+4*8], m0
684    mova     m1, [tmpq+4*20]
685    addps    m1, SPILLED(12)
686    mova     m2, [tmpq+4*20]
687    subps    m2, SPILLED(12)
688    UNSPILL  7, 8
689    subps    m0, m7, SPILLED(11)
690    addps    m7, SPILLED(11)
691    mulps    m4, m7, [costabs + 16*12]
692    mulps    m0, [costabs + 16*14]
; rows 12 and 5
693    addps    m5, m1, m4
694    subps    m1, m4
695    mulps    m7, m1, [winq+4*48]
696    addps    m7, [bufq+4*48]
697    mova     [outq+1536], m7
698    mulps    m1, [winq+4*20]
699    addps    m1, [bufq+4*20]
700    mova     [outq+640], m1
701    mulps    m1, m5, [winq+4*128]
702    mova     [bufq+4*48], m1
703    mulps    m5, [winq+4*100]
704    mova     [bufq+4*20], m5
; rows 14 and 3
705    addps    m6, m2, m0
706    subps    m2, m0
707    mulps    m1, m2, [winq+4*56]
708    addps    m1, [bufq+4*56]
709    mova     [outq+1792], m1
710    mulps    m2, [winq+4*12]
711    addps    m2, [bufq+4*12]
712    mova     [outq+384], m2
713    mulps    m0, m6, [winq+4*136]
714    mova    [bufq+4*56], m0
715    mulps    m6, [winq+4*92]
716    mova     [bufq+4*12], m6
; rows 13 and 4
717    UNSPILL  0, 14
718    mulps    m0, [costabs + 16*13]
719    mova     m3, [tmpq+4*4]
720    addps    m2, m0, m3
721    subps    m3, m0
722    mulps    m0, m3, [winq+4*52]
723    addps    m0, [bufq+4*52]
724    mova     [outq+1664], m0
725    mulps    m3, [winq+4*16]
726    addps    m3, [bufq+4*16]
727    mova     [outq+512], m3
728    mulps    m0, m2, [winq+4*132]
729    mova     [bufq+4*52], m0
730    mulps    m2, [winq+4*96]
731    mova     [bufq+4*16], m2
732    RET
733%endmacro
734
; Instantiate four_imdct36_float for SSE and, when HAVE_AVX_EXTERNAL is set,
; for AVX.
735INIT_XMM sse
736DEFINE_FOUR_IMDCT
737
738%if HAVE_AVX_EXTERNAL
739INIT_XMM avx
740DEFINE_FOUR_IMDCT
741%endif
742