/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

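/* For each SIMD flavour, declare the C wrapper that processes blocks of
 * four IMDCTs and the external assembly routine for a single 36-point
 * IMDCT. */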
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if HAVE_X86ASM
#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */

void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

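/* Copies of ff_mdct_win_float interleaved for the four-at-a-time SIMD IMDCT;
 * filled in ff_mpadsp_init_x86().  Index [1] is the layout used for the
 * first group of blocks at a switch point. */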
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

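/* Accumulate eight products of window and sample values taken at a stride
 * of 64 floats. */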
#define SUM8(op, sum, w, p)               \
{                                         \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
}

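/* For each of the len output positions, accumulate the negated eight-tap
 * dot product of buf with win1 into sum1 and of buf with win2 into sum2,
 * handling four positions per SSE iteration.  The buffers are expected to
 * be 16-byte aligned. */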
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;


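/* One tap of both sums: subtract win1[a] * buf[a] from %xmm0 and
 * buf[a] * win2[b] from %xmm4, with a and b given as byte offsets. */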
#define MULT(a, b)                                 \
    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
    "mulps         %%xmm2, %%xmm1           \n\t"  \
    "subps         %%xmm1, %%xmm0           \n\t"  \
    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
    "subps         %%xmm2, %%xmm4           \n\t"  \

    __asm__ volatile(
            "1:                                   \n\t"
            "xorps       %%xmm0, %%xmm0           \n\t"
            "xorps       %%xmm4, %%xmm4           \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps      %%xmm0, (%4,%0)          \n\t"
            "movaps      %%xmm4, (%5,%0)          \n\t"
            "add            $16,  %0              \n\t"
            "jl              1b                   \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            );

#undef MULT
}

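/* SSE implementation of MPADSPContext.apply_window_float.  The first 32
 * samples are copied past the end of the ring buffer so the windowing loops
 * need not handle wraparound; the partial sums are then combined into the
 * 32 output samples, with a vectorized fast path for incr == 1. */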
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             ptrdiff_t incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
            "movaps    0(%0), %%xmm0   \n\t" \
            "movaps   16(%0), %%xmm1   \n\t" \
            "movaps   32(%0), %%xmm2   \n\t" \
            "movaps   48(%0), %%xmm3   \n\t" \
            "movaps   %%xmm0,   0(%1) \n\t" \
            "movaps   %%xmm1,  16(%1) \n\t" \
            "movaps   %%xmm2,  32(%1) \n\t" \
            "movaps   %%xmm3,  48(%1) \n\t" \
            "movaps   64(%0), %%xmm0   \n\t" \
            "movaps   80(%0), %%xmm1   \n\t" \
            "movaps   96(%0), %%xmm2   \n\t" \
            "movaps  112(%0), %%xmm3   \n\t" \
            "movaps   %%xmm0,  64(%1) \n\t" \
            "movaps   %%xmm1,  80(%1) \n\t" \
            "movaps   %%xmm2,  96(%1) \n\t" \
            "movaps   %%xmm3, 112(%1) \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

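/* Combine one group of four outputs on each side: reversed sumd minus suma
 * for the first half, reversed sumc plus sumb for the second half
 * (shufps $0x1b reverses the four floats in the register). */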
#define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
            "movups " #sumd "(%4),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "subps  " #suma "(%1),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out1 "(%0)          \n\t" \
\
            "movups " #sumc "(%3),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "addps  " #sumb "(%2),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out2 "(%0)          \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        int j;
        float *out2 = out + 32 * incr;
        out[0  ]  = -suma[   0];
        out += incr;
        out2 -= incr;
        for (j = 1; j < 16; j++) {
            *out  = -suma[   j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[   j];
            out  += incr;
            out2 -= incr;
        }
    }

    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
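/* Define imdct36_blocks_<CPU1>(): process groups of four blocks with
 * ff_four_imdct36_float_<CPU2>() and the interleaved windows, then handle
 * the remaining count % 4 blocks one at a time with ff_imdct36_float_<CPU1>(),
 * using the long window for the first blocks at a switch point and the +4
 * table entry for odd-numbered blocks. */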
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                \
    int j;                                                              \
    for (j = 0; j < align_end; j += 4) {                                \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
        /* apply window & overlap with previous buffer */               \
                                                                        \
        /* select window */                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
        in      += 4*18;                                                \
        buf     += 4*18;                                                \
        out     += 4;                                                   \
    }                                                                   \
    for (; j < count; j++) {                                            \
        /* apply window & overlap with previous buffer */               \
                                                                        \
        /* select window */                                             \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
                                                                        \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
                                                                        \
        in  += 18;                                                      \
        buf++;                                                          \
        out++;                                                          \
    }                                                                   \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */

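/* Build the interleaved window tables and install the fastest available
 * implementations for the detected CPU flags. */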
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    av_unused int cpu_flags = av_get_cpu_flags();

    int i, j;
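    /* Interleave ff_mdct_win_float so that each group of four floats carries
     * the window coefficients for four consecutive blocks (even/odd variants
     * in alternating lanes).  Table [1] is the switch-point variant, where
     * the first two lanes use the long window (entries 0 and 4). */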
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i++) {
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_X86ASM */
}