;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000

sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
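
; perm_pos/perm_neg hold the dword indices used by the vpermps shuffles in the
; AVX2 mdct15_postreindex loop to restore interleaved re/im order.
; sign_adjust_r has the sign bit set in every even (real) lane and is XORed in
; to negate the re*re products; sign_adjust_5 flips the two middle lanes for
; the sign fix-ups inside the FFT5 macro.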

SECTION .text

%if ARCH_X86_64

;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
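; A rough scalar sketch of the decomposition used below (illustrative only;
; the actual twiddle values live in exptab, and the sign convention and output
; ordering are assumptions, not restated from the C code): the 15-point
; transform is split into three interleaved 5-point DFTs, which are then
; recombined with per-bin twiddles.
;
;     // F5_b = 5-point DFT of in[b], in[b+3], ..., in[b+12]   (b = 0, 1, 2)
;     // W(x) = exp(-2*pi*I*x/15)
;     out[k] = F5_0[k % 5] + W(k)*F5_1[k % 5] + W(2*k)*F5_2[k % 5];
;
; FFT5 computes one such 5-point DFT; the BUTTERFLIES_* macros perform the
; recombination with pre-arranged twiddles from exptab.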
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
    VBROADCASTSD m0, [inq + %1]         ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
    movsd   xm1, [inq + 1*16 +  8 + %1] ; in[ 3].re, in[ 3].im,         0,         0
    movsd   xm4, [inq + 6*16 +  0 + %1] ; in[12].re, in[12].im,         0,         0
    movhps  xm1, [inq + 3*16 +  0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
    movhps  xm4, [inq + 4*16 +  8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im

    subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
    addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im

    movhlps     %2,   xm1               ; t[1].re, t[1].im, junk,    junk
    addps       %2,   xm1
    addps       %2,   xm0               ; DC[0].re, DC[0].im, junk...
    movlhps     %2,   %2                ; DC[0].re, DC[0].im, DC[0].re, DC[0].im

    shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
    shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im

    mulps       xm%3, xm1, xm5
    mulps       xm4,  xm3, xm6
    mulps       xm1,  xm6

    xorps       xm1,  xm7
    mulps       xm3,  xm5
    addsubps    xm3,  xm1               ; t[0].re, t[0].im, t[2].re, t[2].im
    subps       xm%3, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im

    movhlps     xm2, xm%3, xm3          ; t[2].re, t[2].im, t[5].re, t[5].im
    movlhps     xm3, xm%3               ; t[0].re, t[0].im, t[4].re, t[4].im

    xorps       xm2,  xm7
    addps       xm%3, xm2, xm3
    subps       xm3,  xm2

    shufps      xm3,  xm3, q1032
    vinsertf128 m%3,  m%3, xm3, 1       ; All ACs (tmp[1] through to tmp[4])
    addps       m%3,  m%3,  m0          ; Finally offset with DCs
%endmacro
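
; For reference, one FFT5 invocation is equivalent (up to rounding and the
; assumed sign convention) to a plain 5-point DFT of the stride-3 subsequence
; selected by the byte offset %1 (off = %1/8 elements, i.e. 0, 1 or 2).
; Illustrative C99 using float complex rather than FFTComplex:
;
;     for (int k = 0; k < 5; k++) {
;         float complex acc = 0.0f;
;         for (int a = 0; a < 5; a++)
;             acc += in[3*a + off] * cexpf(-2*I*M_PI*a*k/5);
;         tmp[k] = acc;               // tmp[0] -> %2, tmp[1..4] -> m%3
;     }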

%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
    mulps xm0,  xm9, [exptabq + %1 + 16*0]
    mulps xm1, xm10, [exptabq + %1 + 16*1]

    haddps  xm0,  xm1
    movhlps xm1,  xm0                   ; move the high (second) product pair to the low qword

    addps   xm0,  xm1
    addps   xm0,  xm8

    movsd [outq], xm0
%endmacro

%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
    mulps  m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
    mulps  m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
    mulps  m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
    mulps  m3, m13, [exptabq + 64*1 + 1*mmsize + %1]

    addps  m0, m0, m2
    addps  m1, m1, m3
    addps  m0, m0, m11

    shufps m1, m1, m1, q2301
    addps  m0, m0, m1

    vextractf128 xm1, m0, 1

    movlps [outq + strideq*1], xm0
    movhps [outq + strideq*2], xm0
    movlps [outq +  stride3q], xm1
    movhps [outq + strideq*4], xm1
%endmacro
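
; One BUTTERFLIES_DC + BUTTERFLIES_AC pair produces the five outputs of one
; column: bin 0 from the FFT5 DC terms (xm8/xm9/xm10), bins 1..4 from the AC
; terms (m11/m12/m13), each combined as F5_0 + tw1*F5_1 + tw2*F5_2 with the
; complex twiddles stored in exptab in a layout pre-split for the
; haddps/shufps based multiplies (layout not restated here).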

INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
    shl strideq, 3

    movaps xm5, [exptabq + 480 + 16*0]
    movaps xm6, [exptabq + 480 + 16*1]
    movaps xm7, [sign_adjust_5]

    FFT5  0,  xm8, 11
    FFT5  8,  xm9, 12
    FFT5 16, xm10, 13

%define stride3q inq
    lea stride3q, [strideq + strideq*2]
    lea stride5q, [strideq + strideq*4]

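    ; Three passes: each DC/AC pair fills bins 0..4 of one output column
    ; ([outq], then strides 1..4); outq then advances by 5*stride.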
    BUTTERFLIES_DC (8*6 + 4*0)*2*4
    BUTTERFLIES_AC (8*0 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*1)*2*4
    BUTTERFLIES_AC (8*2 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*2)*2*4
    BUTTERFLIES_AC (8*4 + 0*0)*2*4

    RET

%endif ; ARCH_X86_64

;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
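; LUT_LOAD_4D gathers 2 (SSE3) or 4 (AVX2) complex values in[lut[...]] into %1,
; starting at the index held in %3; r4 (the len8 register, free once the loop
; bounds have been computed) is used as the scratch index.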
%macro LUT_LOAD_4D 3
    mov      r4d, [lutq + %3q*4 +  0]
    movsd  xmm%1, [inq +  r4q*8]
    mov      r4d, [lutq + %3q*4 +  4]
    movhps xmm%1, [inq +  r4q*8]
%if cpuflag(avx2)
    mov      r4d, [lutq + %3q*4 +  8]
    movsd     %2, [inq +  r4q*8]
    mov      r4d, [lutq + %3q*4 + 12]
    movhps    %2, [inq +  r4q*8]
    vinsertf128 %1, %1, %2, 1
%endif
%endmacro

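; A scalar view of what each loop iteration below stores, as implied by the
; shuffles (illustrative only; N = len8*2 and j = N - 1 - i is the mirrored
; index):
;
;     out[i].re = -(in[lut[i]].re * exp[i].re - in[lut[i]].im * exp[i].im);
;     out[i].im =   in[lut[j]].re * exp[j].im + in[lut[j]].im * exp[j].re;
;
; The loop walks i upwards from 0 and the mirrored position downwards from the
; top, handling %1 (2 for SSE3, 4 for AVX2) values per side per iteration.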
%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n

    xor offset_nq, offset_nq
    lea offset_pq, [len8q*2 - %1]

    movaps m7,  [sign_adjust_r]

%if cpuflag(avx2)
    movaps   m8, [perm_pos]
    movaps   m9, [perm_neg]
%endif

.loop:
    movups m0, [expq + offset_pq*8]     ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
    movups m1, [expq + offset_nq*8]     ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im

    LUT_LOAD_4D m3, xm4, offset_p       ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
    LUT_LOAD_4D m4, xm5, offset_n       ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im

    mulps  m5, m3, m0                   ; in[p].reim * exp[p].reim
    mulps  m6, m4, m1                   ; in[n].reim * exp[n].reim

    xorps  m5, m7                       ; in[p].re *= -1, in[p].im *= 1
    xorps  m6, m7                       ; in[n].re *= -1, in[n].im *= 1

    shufps m3, m3, m3, q2301            ; in[p].imre
    shufps m4, m4, m4, q2301            ; in[n].imre

    mulps  m3, m0                       ; in[p].imre * exp[p].reim
    mulps  m4, m1                       ; in[n].imre * exp[n].reim

    haddps m3, m6                       ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
    haddps m5, m4                       ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im

%if cpuflag(avx2)
    vpermps m3, m9, m3                  ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
    vpermps m5, m8, m5                  ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
    shufps m3, m3, m3, q0312
    shufps m5, m5, m5, q2130
%endif

    movups [outq + offset_nq*8], m3
    movups [outq + offset_pq*8], m5

    sub offset_pq, %1
    add offset_nq, %1
    cmp offset_nq, offset_pq
    jle .loop

    REP_RET
%endmacro

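; SSE3 build: two complex values per side per iteration. The AVX2 build
; (x86-64 with external AVX2 assembly enabled) handles four and uses the two
; extra registers for the vpermps index vectors.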
INIT_XMM sse3
POSTROTATE_FN 2

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif