1;******************************************************************************
2;* 32 point SSE-optimized DCT transform
3;* Copyright (c) 2010 Vitor Sessak
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA 32
25
; Sign mask: lanes 0-1 are zero, lanes 2-3 hold only the float sign bit
; (0x80000000). The 4-lane pattern is stored twice so that a 32-byte load
; sees it in both 128-bit halves. BUTTERFLY0 XORs it into a vector, which
; negates the lanes whose mask word is 0x80000000.
26ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
27
; Per-pass multiplier table, addressed by byte offset in this file:
;   +0, +16, +32, +48   pass 1 (the AVX code reads +0 and +32 as 32-byte rows)
;   +64, +80            pass 2 (the AVX code reads +64 as one 32-byte row)
;   +96                 pass 3 (4 values, stored twice)
;   +128                pass 4 (4 values, stored twice)
;   +160                pass 5 (4 values, stored twice)
;   +192                constant used by BUTTERFLY3V (0.707107 in every lane)
; The duplicated rows let 32-byte AVX operands see the same four values in
; both halves.
28ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
29            dd   0.553104,  0.582935,  0.622504,  0.674808
30            dd -10.190008, -3.407609, -2.057781, -1.484165
31            dd  -1.169440, -0.972568, -0.839350, -0.744536
32            dd   0.502419,  0.522499,  0.566944,  0.646822
33            dd   0.788155,  1.060678,  1.722447,  5.101149
34            dd   0.509796,  0.601345,  0.899976,  2.562916
35            dd   0.509796,  0.601345,  0.899976,  2.562916
36            dd   1.000000,  1.000000,  1.306563,  0.541196
37            dd   1.000000,  1.000000,  1.306563,  0.541196
38            dd   1.000000,  0.707107,  1.000000, -0.707107
39            dd   1.000000,  0.707107,  1.000000, -0.707107
40            dd   0.707107,  0.707107,  0.707107,  0.707107
41
; BUTTERFLY a, b, c, tmp
;   %1 a   : register, in/out
;   %2 b   : register, in/out
;   %3 c   : multiplier (register or memory operand)
;   %4 tmp : scratch register, clobbered
; Per lane, on exit:  b = a + b,  a = (a - b) * c   (using the values on entry).
; Written in 3-operand form; this macro is also expanded for the non-AVX
; builds below, so it presumably relies on the included x86util/x86inc layer
; to accept that form there - NOTE(review): confirm.
42%macro BUTTERFLY 4
43    subps  %4, %1, %2                 ; tmp = a - b
44    addps  %2, %2, %1                 ; b   = b + a
45    mulps  %1, %4, %3                 ; a   = tmp * c
46%endmacro
47
; BUTTERFLY0 a, mask, c, tmp, imm
;   %1 a    : register, in/out
;   %2 mask : sign mask XORed into a (see ps_p1p1m1m1)
;   %3 c    : multiplier
;   %4 tmp  : scratch register, clobbered
;   %5 imm  : shuffle immediate selecting the lane permutation
; Per lane, on exit:  a = (shuffle(a, imm) + (a XOR mask)) * c.
; Both branches produce the same value in a. They differ only in instruction
; selection and in what is left in tmp (the shuffled copy vs. the sum).
48%macro BUTTERFLY0 5
49%if cpuflag(sse2) && notcpuflag(avx)
50    pshufd %4, %1, %5                 ; tmp = permuted copy of a
51    xorps  %1, %2                     ; a ^= mask
52    addps  %1, %4                     ; a += tmp
53    mulps  %1, %3                     ; a *= c
54%else
55    shufps %4, %1, %1, %5             ; tmp = permuted copy of a
56    xorps  %1, %1, %2                 ; a ^= mask
57    addps  %4, %4, %1                 ; tmp += a
58    mulps  %1, %4, %3                 ; a = tmp * c
59%endif
60%endmacro
61
; BUTTERFLY2 a, mask, c, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which selects lanes 3,2,1,0, i.e.
; the permuted copy of a is a with its four lanes reversed.
62%macro BUTTERFLY2 4
63    BUTTERFLY0 %1, %2, %3, %4, 0x1b
64%endmacro
65
; BUTTERFLY3 a, mask, c, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which selects lanes 1,0,3,2, i.e.
; the permuted copy of a has each adjacent pair of lanes swapped.
66%macro BUTTERFLY3 4
67    BUTTERFLY0 %1, %2, %3, %4, 0xb1
68%endmacro
69
; BUTTERFLY3V r1, r2, r3, r4, rtmp
; Arguments are register NUMBERS (the macro prefixes them with "m").
; With k = [ps_cos_vec+192], the values on exit (in terms of values on entry)
; are:
;   r1 = r1 + r2        r2 = (r1 - r2) * k
;   r3 = r3 + r4        r4 = (r4 - r3) * k    (note the reversed difference)
; rtmp is clobbered. SWAP comes from the included macro layer and presumably
; only renames registers at assembly time; after it, "r2" names the register
; that holds the difference - NOTE(review): confirm against x86inc.
70%macro BUTTERFLY3V 5
71    movaps m%5, m%1                   ; tmp = r1
72    addps  m%1, m%2                   ; r1 += r2
73    subps  m%5, m%2                   ; tmp = r1(old) - r2
74    SWAP %2, %5                       ; the difference is now called r2
75    mulps  m%2, [ps_cos_vec+192]
76    movaps m%5, m%3                   ; tmp = r3
77    addps  m%3, m%4                   ; r3 += r4
78    subps  m%4, m%5                   ; r4 = r4 - r3(old)
79    mulps  m%4, [ps_cos_vec+192]
80%endmacro
81
; PASS6_AND_PERMUTE
; Scalar final pass. It works on single floats (movss/addss) and on the
; 32-bit integer scratch register tmpd, and reads and writes the out buffer
; in place.
;
; Inputs:
;   - the 32 floats already stored at [outq+0 .. outq+127]
;   - lane 0 of m0, m1, m4, m5 and m6 (each is read before it is first loaded)
; Clobbers: m0-m7, tmpd.
; Writes every float slot from [outq+4] to [outq+120]. [outq+0] is not
; touched and [outq+124] is only read.
;
; The instruction order matters: every slot is read for the last time before
; it is overwritten, so do not reorder loads and stores.
82%macro PASS6_AND_PERMUTE 0
83    mov         tmpd, [outq+4]        ; raw copy, stored to [outq+64] below
84    movss         m7, [outq+72]
85    addss         m7, [outq+76]
86    movss         m3, [outq+56]
87    addss         m3, [outq+60]
88    addss         m4, m3
89    movss         m2, [outq+52]
90    addss         m2, m3
91    movss         m3, [outq+104]
92    addss         m3, [outq+108]
93    addss         m1, m3
94    addss         m5, m4
95    movss [outq+ 16], m1
96    movss         m1, [outq+100]
97    addss         m1, m3
98    movss         m3, [outq+40]
99    movss [outq+ 48], m1
100    addss         m3, [outq+44]
101    movss         m1, [outq+100]
102    addss         m4, m3
103    addss         m3, m2
104    addss         m1, [outq+108]
105    movss [outq+ 40], m3
106    addss         m2, [outq+36]
107    movss         m3, [outq+8]
108    movss [outq+ 56], m2
109    addss         m3, [outq+12]
110    movss [outq+ 32], m3
111    movss         m3, [outq+80]
112    movss [outq+  8], m5
113    movss [outq+ 80], m1
114    movss         m2, [outq+52]
115    movss         m5, [outq+120]
116    addss         m5, [outq+124]
117    movss         m1, [outq+64]
118    addss         m2, [outq+60]
119    addss         m0, m5
120    addss         m5, [outq+116]
121    mov    [outq+64], tmpd            ; old [outq+4], moved unchanged
122    addss         m6, m0
123    addss         m1, m6
124    mov         tmpd, [outq+12]
125    mov   [outq+ 96], tmpd            ; old [outq+12], moved unchanged
126    movss [outq+  4], m1
127    movss         m1, [outq+24]
128    movss [outq+ 24], m4
129    movss         m4, [outq+88]
130    addss         m4, [outq+92]
131    addss         m3, m4
132    addss         m4, [outq+84]
133    mov         tmpd, [outq+108]      ; raw copy, stored to [outq+112] below
134    addss         m1, [outq+28]
135    addss         m0, m1
136    addss         m1, m5
137    addss         m6, m3
138    addss         m3, m0
139    addss         m0, m7
140    addss         m5, [outq+20]
141    addss         m7, m1
142    movss [outq+ 12], m6
143    mov   [outq+112], tmpd            ; old [outq+108], moved unchanged
144    movss         m6, [outq+28]
145    movss [outq+ 28], m0
146    movss         m0, [outq+36]
147    movss [outq+ 36], m7
148    addss         m1, m4
149    movss         m7, [outq+116]
150    addss         m0, m2
151    addss         m7, [outq+124]
152    movss [outq+ 72], m0
153    movss         m0, [outq+44]
154    addss         m2, m0
155    movss [outq+ 44], m1
156    movss [outq+ 88], m2
157    addss         m0, [outq+60]
158    mov         tmpd, [outq+60]
159    mov   [outq+120], tmpd            ; old [outq+60], moved unchanged
160    movss [outq+104], m0
161    addss         m4, m5
162    addss         m5, [outq+68]
163    movss  [outq+52], m4
164    movss  [outq+60], m5
165    movss         m4, [outq+68]
166    movss         m5, [outq+20]
167    movss [outq+ 20], m3
168    addss         m5, m7
169    addss         m7, m6
170    addss         m4, m5
171    movss         m2, [outq+84]
172    addss         m2, [outq+92]
173    addss         m5, m2
174    movss [outq+ 68], m4
175    addss         m2, m7
176    movss         m4, [outq+76]
177    movss [outq+ 84], m2
178    movss [outq+ 76], m5
179    addss         m7, m4
180    addss         m6, [outq+124]
181    addss         m4, m6
182    addss         m6, [outq+92]
183    movss [outq+100], m4
184    movss [outq+108], m6
185    movss         m6, [outq+92]       ; last read of [outq+92] before it is overwritten
186    movss  [outq+92], m7
187    addss         m6, [outq+124]
188    movss [outq+116], m6
189%endmacro
190
; ---------------------------------------------------------------------------
; AVX variant, assembled only when HAVE_AVX_EXTERNAL is set.
; Passes 1-5 run on 256-bit registers and the final pass is the scalar
; PASS6_AND_PERMUTE macro.
;   out (outq) : destination, 32 floats; written with 32-byte vmovaps, so it
;                must be 32-byte aligned
;   in  (inq)  : source, 32 floats; read with 32-byte vmovaps, same alignment
;                requirement
;   tmp        : integer scratch register used by PASS6_AND_PERMUTE
; ---------------------------------------------------------------------------
191INIT_YMM avx
192SECTION .text
193%if HAVE_AVX_EXTERNAL
194; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
195cglobal dct32_float, 2,3,8, out, in, tmp
196    ; pass 1
197    vmovaps     m4, [inq+0]           ; m4 = in[0..7]
198    vinsertf128 m5, m5, [inq+96], 1   ; m5.hi = in[24..27]
199    vinsertf128 m5, m5, [inq+112], 0  ; m5.lo = in[28..31]
200    vshufps     m5, m5, m5, 0x1b      ; reverse the 4 lanes inside each half
201    BUTTERFLY   m4, m5, [ps_cos_vec], m6
202
203    vmovaps     m2, [inq+64]          ; m2 = in[16..23]
204    vinsertf128 m6, m6, [inq+32], 1   ; m6.hi = in[8..11]
205    vinsertf128 m6, m6, [inq+48], 0   ; m6.lo = in[12..15]
206    vshufps     m6, m6, m6, 0x1b      ; reverse the 4 lanes inside each half
207    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0
208
209    ; pass 2
210
211    BUTTERFLY  m5, m6, [ps_cos_vec+64], m0
212    BUTTERFLY  m4, m2, [ps_cos_vec+64], m7
213
214
215    ; pass 3
216    vperm2f128  m3, m6, m4, 0x31      ; m3 = { m6.hi, m4.hi }
217    vperm2f128  m1, m6, m4, 0x20      ; m1 = { m6.lo, m4.lo }
218    vshufps     m3, m3, m3, 0x1b
219
220    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6
221
222
223    vperm2f128  m4, m5, m2, 0x20      ; m4 = { m5.lo, m2.lo }
224    vperm2f128  m5, m5, m2, 0x31      ; m5 = { m5.hi, m2.hi }
225    vshufps     m5, m5, m5, 0x1b
226
227    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6
228
229    ; pass 4
230    vmovaps m6, [ps_p1p1m1m1+0]       ; sign mask, kept in m6 through pass 5
231    vmovaps m2, [ps_cos_vec+128]
232
233    BUTTERFLY2  m5, m6, m2, m7
234    BUTTERFLY2  m4, m6, m2, m7
235    BUTTERFLY2  m1, m6, m2, m7
236    BUTTERFLY2  m3, m6, m2, m7
237
238
239    ; pass 5
240    vshufps m6, m6, m6, 0xcc          ; mask becomes 0, sign, 0, sign in each half
241    vmovaps m2, [ps_cos_vec+160]
242
243    BUTTERFLY3  m5, m6, m2, m7
244    BUTTERFLY3  m4, m6, m2, m7
245    BUTTERFLY3  m1, m6, m2, m7
246    BUTTERFLY3  m3, m6, m2, m7
247
248    vperm2f128  m6, m3, m3, 0x31      ; m6 = high half of m3 (pass 6 reads lane 0 of m6)
249    vmovaps [outq], m3
250
251    vextractf128  [outq+64], m5, 1
252    vextractf128  [outq+32], m5, 0
253
254    vextractf128  [outq+80], m4, 1
255    vextractf128  [outq+48], m4, 0
256
257    vperm2f128  m0, m1, m1, 0x31      ; m0 = high half of m1 (pass 6 reads lane 0 of m0)
258    vmovaps [outq+96], m1
259
260    vzeroupper                        ; clear upper ymm halves before the scalar movss/addss code below
261
262    ;    pass 6, no SIMD...
263INIT_XMM
264    PASS6_AND_PERMUTE
265    RET
266%endif
267
; x86-64: the function below requests 16 vector registers, so a "spill" to
; slot N (8..15) is just SWAP with register N and nothing is written to
; memory. (SWAP presumably only renames registers at assembly time -
; NOTE(review): confirm against x86inc.)
268%if ARCH_X86_64
269%define SPILL SWAP
270%define UNSPILL SWAP
271
; PASS5 (x86-64 variant)
; Gathers the eight pass-4 result vectors into m8..m15 (SWAP/PERMUTE),
; transposes them as two 4x4 blocks, and runs pass 5 with BUTTERFLY3V plus
; vector adds. m0 is passed as the scratch register to TRANSPOSE4x4PS and
; BUTTERFLY3V. SWAP, PERMUTE and TRANSPOSE4x4PS come from the included macro
; layer and their exact semantics are not visible in this file.
; On exit the data PASS6 (x86-64) consumes is in m8..m15.
272%macro PASS5 0
273    nop ; FIXME code alignment
274    SWAP 5, 8
275    SWAP 4, 12
276    SWAP 6, 14
277    SWAP 7, 13
278    SWAP 0, 15
279    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
280    TRANSPOSE4x4PS 8, 9, 10, 11, 0
281    BUTTERFLY3V    8, 9, 10, 11, 0
282    addps   m10, m11
283    TRANSPOSE4x4PS 12, 13, 14, 15, 0
284    BUTTERFLY3V    12, 13, 14, 15, 0
285    addps   m14, m15
286    addps   m12, m14
287    addps   m14, m13
288    addps   m13, m15
289%endmacro
290
; PASS6 (x86-64 variant)
; Final pass, working from m8..m15 as left by PASS5. Three groups of stores:
;   1. lane 0 of m8..m14 goes to [outq+0x00 .. 0x60] in steps of 0x10, and
;      all four lanes of m15 go to [outq+0x70] (this also fills out+0x7c)
;   2. lane 1 of each register (moved to lane 0 by pshuflw 0xe) is summed
;      with its neighbour's and stored to [outq+0x08 .. 0x78], step 0x10
;   3. lanes 2 and 3 are extracted (movhlps / pshufd 3), combined, and the
;      15 sums are stored to [outq+4], [outq+12], ... [outq+116]
; The SWAP/PERMUTE lines rotate register names at assembly time so that each
; %rep iteration can use the fixed names m0/m1 (presumed x86inc semantics -
; NOTE(review): confirm).
291%macro PASS6 0
292    SWAP 9, 12
293    SWAP 11, 14
294    movss [outq+0x00], m8
295    pshuflw m0, m8, 0xe               ; lane 0 of m0 = lane 1 of m8
296    movss [outq+0x10], m9
297    pshuflw m1, m9, 0xe
298    movss [outq+0x20], m10
299    pshuflw m2, m10, 0xe
300    movss [outq+0x30], m11
301    pshuflw m3, m11, 0xe
302    movss [outq+0x40], m12
303    pshuflw m4, m12, 0xe
304    movss [outq+0x50], m13
305    pshuflw m5, m13, 0xe
306    movss [outq+0x60], m14
307    pshuflw m6, m14, 0xe
308    movaps [outq+0x70], m15           ; full 16-byte store: also writes out+0x74..0x7f
309    pshuflw m7, m15, 0xe
310    addss   m0, m1
311    addss   m1, m2
312    movss [outq+0x08], m0
313    addss   m2, m3
314    movss [outq+0x18], m1
315    addss   m3, m4
316    movss [outq+0x28], m2
317    addss   m4, m5
318    movss [outq+0x38], m3
319    addss   m5, m6
320    movss [outq+0x48], m4
321    addss   m6, m7
322    movss [outq+0x58], m5
323    movss [outq+0x68], m6
324    movss [outq+0x78], m7             ; overwrites the value the movaps above put at out+0x78
325
326    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
327    movhlps m0, m1                    ; lane 0 of m0 = lane 2 of m1
328    pshufd  m1, m1, 3                 ; lane 0 of m1 = lane 3 of m1
329    SWAP 0, 2, 4, 6, 8, 10, 12, 14
330    SWAP 1, 3, 5, 7, 9, 11, 13, 15
331%rep 7
332    movhlps m0, m1
333    pshufd  m1, m1, 3
334    addss   m15, m1
335    SWAP 0, 2, 4, 6, 8, 10, 12, 14
336    SWAP 1, 3, 5, 7, 9, 11, 13, 15
337%endrep
338%assign i 4
339%rep 15
340    addss m0, m1
341    movss [outq+i], m0                ; i = 4, 12, ..., 116
342    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
343    %assign i i+8
344%endrep
345%endmacro
346
; x86-32: spill slots 8..15 live in memory. Slot N maps to the 16 bytes at
; [outq + (N-8)*16], so the eight slots cover out+0 .. out+127, which is the
; output buffer itself. movaps requires outq to be 16-byte aligned.
347%else ; ARCH_X86_32
348%macro SPILL 2 ; xmm#, mempos
349    movaps [outq+(%2-8)*16], m%1
350%endmacro
351%macro UNSPILL 2
352    movaps m%1, [outq+(%2-8)*16]
353%endmacro
354
; x86-32: the final pass is the scalar PASS6_AND_PERMUTE macro.
355%define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32 variant)
; Expects m3 to still hold the ps_p1p1m1m1 mask loaded for pass 4. shufps 0xcc
; turns it into the 0, sign, 0, sign pattern used here. Applies BUTTERFLY3 to
; each of the eight pass-4 vectors and stores every result to a spill slot
; 8..15, which fills out+0 .. out+127 for PASS6_AND_PERMUTE.
; On exit m0, m1, m4, m5 and m6 still hold the vectors last stored to slots
; 15, 14, 11, 10 and 9 respectively. PASS6_AND_PERMUTE reads lane 0 of those
; registers directly.
356%macro PASS5 0
357    movaps      m2, [ps_cos_vec+160]
358    shufps      m3, m3, 0xcc
359
360    BUTTERFLY3  m5, m3, m2, m1
361    SPILL 5, 8
362
363    UNSPILL 1, 9
364    BUTTERFLY3  m1, m3, m2, m5
365    SPILL 1, 14
366
367    BUTTERFLY3  m4, m3, m2, m5
368    SPILL 4, 12
369
370    BUTTERFLY3  m7, m3, m2, m5
371    SPILL 7, 13
372
373    UNSPILL 5, 10
374    BUTTERFLY3  m5, m3, m2, m7
375    SPILL 5, 10
376
377    UNSPILL 4, 11
378    BUTTERFLY3  m4, m3, m2, m7
379    SPILL 4, 11
380
381    BUTTERFLY3  m6, m3, m2, m7
382    SPILL 6, 9
383
384    BUTTERFLY3  m0, m3, m2, m7
385    SPILL 0, 15
386%endmacro
387%endif
388
389
; ---------------------------------------------------------------------------
; DCT32_FUNC: emits one dct32_float function for the instruction set selected
; by the preceding INIT_XMM line.
;   out (outq) : destination, 32 floats; accessed with movaps, so it must be
;                16-byte aligned. On x86-32 it is also used as spill space
;                (see SPILL/UNSPILL).
;   in  (inq)  : source, 32 floats; read with movaps/pshufd memory operands,
;                so it must be 16-byte aligned
;   tmp        : integer scratch register, used by PASS6_AND_PERMUTE
; Passes 1-4 are inlined here. PASS5 and PASS6 expand to the x86-64 or x86-32
; variants defined above. Slot numbers 8..15 given to SPILL/UNSPILL are
; register numbers on x86-64 and memory slots on x86-32.
; ---------------------------------------------------------------------------
390; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
391%macro DCT32_FUNC 0
392cglobal dct32_float, 2, 3, 16, out, in, tmp
393    ; pass 1
394
395    movaps      m0, [inq+0]           ; in[0..3]
396    LOAD_INV    m1, [inq+112]         ; in[28..31], lane order reversed
397    BUTTERFLY   m0, m1, [ps_cos_vec], m3
398
399    movaps      m7, [inq+64]          ; in[16..19]
400    LOAD_INV    m4, [inq+48]          ; in[12..15], lane order reversed
401    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
402
403    ; pass 2
404    movaps      m2, [ps_cos_vec+64]
405    BUTTERFLY   m1, m4, m2, m3
406    SPILL 1, 11
407    SPILL 4, 8
408
409    ; pass 1
410    movaps      m1, [inq+16]          ; in[4..7]
411    LOAD_INV    m6, [inq+96]          ; in[24..27], lane order reversed
412    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
413
414    movaps      m4, [inq+80]          ; in[20..23]
415    LOAD_INV    m5, [inq+32]          ; in[8..11], lane order reversed
416    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
417
418    ; pass 2
419    BUTTERFLY   m0, m7, m2, m3        ; m2 still holds [ps_cos_vec+64]
420
421    movaps      m2, [ps_cos_vec+80]
422    BUTTERFLY   m6, m5, m2, m3
423
424    BUTTERFLY   m1, m4, m2, m3
425
426    ; pass 3
427    movaps      m2, [ps_cos_vec+96]
428    shufps      m1, m1, 0x1b          ; reverse lanes
429    BUTTERFLY   m0, m1, m2, m3
430    SPILL 0, 15
431    SPILL 1, 14
432
433    UNSPILL 0, 8
434    shufps      m5, m5, 0x1b
435    BUTTERFLY   m0, m5, m2, m3
436
437    UNSPILL 1, 11
438    shufps      m6, m6, 0x1b
439    BUTTERFLY   m1, m6, m2, m3
440    SPILL 1, 11
441
442    shufps      m4, m4, 0x1b
443    BUTTERFLY   m7, m4, m2, m3
444
445    ; pass 4
446    movaps      m3, [ps_p1p1m1m1+0]   ; sign mask; the x86-32 PASS5 expects it to stay in m3
447    movaps      m2, [ps_cos_vec+128]
448
449    BUTTERFLY2  m5, m3, m2, m1
450
451    BUTTERFLY2  m0, m3, m2, m1
452    SPILL 0, 9
453
454    BUTTERFLY2  m6, m3, m2, m1
455    SPILL 6, 10
456
457    UNSPILL 0, 11
458    BUTTERFLY2  m0, m3, m2, m1
459    SPILL 0, 11
460
461    BUTTERFLY2  m4, m3, m2, m1
462
463    BUTTERFLY2  m7, m3, m2, m1
464
465    UNSPILL 6, 14
466    BUTTERFLY2  m6, m3, m2, m1
467
468    UNSPILL 0, 15
469    BUTTERFLY2  m0, m3, m2, m1
470
471    PASS5
472    PASS6
473    RET
474%endmacro
475
; LOAD_INV dst, src
; Loads four floats from src into dst with the lane order reversed
; (shuffle immediate 0x1b selects lanes 3,2,1,0).
;   sse2     : a single pshufd with a memory source
;   sse only : movaps followed by an in-register shufps
; Emits nothing if neither cpuflag is set.
; Defined after DCT32_FUNC but before its first expansion, which is enough
; because macros are resolved when DCT32_FUNC is invoked.
476%macro LOAD_INV 2
477%if cpuflag(sse2)
478    pshufd      %1, %2, 0x1b
479%elif cpuflag(sse)
480    movaps      %1, %2
481    shufps      %1, %1, 0x1b
482%endif
483%endmacro
484
; Instantiate the function: the plain-SSE version is built only for x86-32,
; the SSE2 version is built for every target.
485%if ARCH_X86_32
486INIT_XMM sse
487DCT32_FUNC
488%endif
489
490INIT_XMM sse2
491DCT32_FUNC
492