1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 09 марта 2016 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <dsp/dsp.h>
23 #include <dsp/bits.h>
24 #include <test/test.h>
25 
26 #include <core/types.h>
27 #include <core/debug.h>
28 
29 #include <dsp/arch/x86/features.h>
30 
31 #define DSP_ARCH_X86_AVX_IMPL
32 
33 #include <dsp/arch/x86/avx/xcr.h>
34 #include <dsp/arch/x86/avx/const.h>
35 
36 #include <dsp/arch/x86/avx/copy.h>
37 #include <dsp/arch/x86/avx/float.h>
38 #include <dsp/arch/x86/avx/complex.h>
39 #include <dsp/arch/x86/avx/pcomplex.h>
40 
41 #include <dsp/arch/x86/avx/pmath/op_kx.h>
42 #include <dsp/arch/x86/avx/pmath/op_vv.h>
43 #include <dsp/arch/x86/avx/pmath/fmop_kx.h>
44 #include <dsp/arch/x86/avx/pmath/fmop_vv.h>
45 #include <dsp/arch/x86/avx/pmath/abs_vv.h>
46 #include <dsp/arch/x86/avx/pmath/minmax.h>
47 
48 #include <dsp/arch/x86/avx/hmath/hsum.h>
49 #include <dsp/arch/x86/avx/hmath/hdotp.h>
50 
51 #include <dsp/arch/x86/avx/mix.h>
52 #include <dsp/arch/x86/avx/search/minmax.h>
53 
54 #include <dsp/arch/x86/avx/fft.h>
55 #include <dsp/arch/x86/avx/pfft.h>
56 #include <dsp/arch/x86/avx/fastconv.h>
57 
58 #include <dsp/arch/x86/avx/filters/static.h>
59 #include <dsp/arch/x86/avx/filters/dynamic.h>
60 #include <dsp/arch/x86/avx/filters/transform.h>
61 #include <dsp/arch/x86/avx/filters/transfer.h>
62 
63 #include <dsp/arch/x86/avx/msmatrix.h>
64 #include <dsp/arch/x86/avx/resampling.h>
65 #include <dsp/arch/x86/avx/convolution.h>
66 #include <dsp/arch/x86/avx/interpolate.h>
67 
68 #undef DSP_ARCH_X86_AVX_IMPL
69 
70 namespace avx
71 {
72     using namespace x86;
73 
74     #define EXPORT2(function, export)               { dsp::function = avx::export; TEST_EXPORT(avx::export); }
75     #define EXPORT1(function)                       EXPORT2(function, function)
76 
77     #define EXPORT2_X64(function, export)           IF_ARCH_X86_64(EXPORT2(function, export));
78     #define SUPPORT_X64(function)                   IF_ARCH_X86_64(TEST_EXPORT(avx::function))
79 
80     #define CEXPORT2(cond, function, export)    \
81     IF_ARCH_X86( \
82             TEST_EXPORT(avx::export); \
83             if (cond) \
84                 dsp::function = avx::export; \
85         );
86 
87     #define CEXPORT1(cond, export)    \
88     IF_ARCH_X86( \
89             TEST_EXPORT(avx::export); \
90             if (cond) \
91                 dsp::export = avx::export; \
92         );
93 
94     #define CEXPORT2_X64(cond, function, export)    \
95         IF_ARCH_X86_64( \
96                 TEST_EXPORT(avx::export); \
97                 if (cond) \
98                     dsp::function = avx::export; \
99             );
100 
101     #define CEXPORT1_X64(cond, export)    \
102         IF_ARCH_X86_64( \
103                 TEST_EXPORT(avx::export); \
104                 if (cond) \
105                     dsp::export = avx::export; \
106             );
107 
dsp_init(const cpu_features_t * f)108     void dsp_init(const cpu_features_t *f)
109     {
110         if (!(f->features & CPU_OPTION_AVX))
111             return;
112 
113         lsp_trace("Optimizing DSP for AVX instruction set");
114 
115         TEST_EXPORT(avx::copy);
116 
117         // This routine sucks on AMD Bulldozer processor family but is pretty great on Intel
118         // Not tested on AMD Processors above Bulldozer family
119         bool favx   = feature_check(f, FEAT_FAST_AVX);
120         bool ffma   = favx && feature_check(f, FEAT_FAST_FMA3);
121 
122         CEXPORT2_X64(favx, reverse1, reverse1);
123         CEXPORT2_X64(favx, reverse2, reverse2);
124 
125         CEXPORT1(favx, limit1);
126         CEXPORT1(favx, limit2);
127         CEXPORT1(favx, sanitize1);
128         CEXPORT1(favx, sanitize2);
129 
130         // Conditional export, depending on fast AVX implementation
131         CEXPORT1(favx, add_k2);
132         CEXPORT1(favx, sub_k2);
133         CEXPORT1(favx, rsub_k2);
134         CEXPORT1(favx, mul_k2);
135         CEXPORT1(favx, div_k2);
136         CEXPORT1(favx, rdiv_k2);
137         CEXPORT1(favx, mod_k2);
138         CEXPORT1(favx, rmod_k2);
139 
140         CEXPORT1(favx, add_k3);
141         CEXPORT1(favx, sub_k3);
142         CEXPORT1(favx, rsub_k3);
143         CEXPORT1(favx, mul_k3);
144         CEXPORT1(favx, div_k3);
145         CEXPORT1(favx, rdiv_k3);
146         CEXPORT1(favx, mod_k3);
147         CEXPORT1(favx, rmod_k3);
148 
149         CEXPORT1(favx, add2);
150         CEXPORT1(favx, sub2);
151         CEXPORT1(favx, rsub2);
152         CEXPORT1(favx, mul2);
153         CEXPORT1(favx, div2);
154         CEXPORT1(favx, rdiv2);
155         CEXPORT1(favx, mod2);
156         CEXPORT1(favx, rmod2);
157 
158         CEXPORT1(favx, add3);
159         CEXPORT1(favx, sub3);
160         CEXPORT1(favx, mul3);
161         CEXPORT1(favx, div3);
162         CEXPORT1(favx, mod3);
163 
164         CEXPORT1(favx, pmin2);
165         CEXPORT1(favx, pmax2);
166         CEXPORT1(favx, psmin2);
167         CEXPORT1(favx, psmax2);
168         CEXPORT1(favx, pamin2);
169         CEXPORT1(favx, pamax2);
170         CEXPORT1(favx, pmin3);
171         CEXPORT1(favx, pmax3);
172         CEXPORT1(favx, psmin3);
173         CEXPORT1(favx, psmax3);
174         CEXPORT1(favx, pamin3);
175         CEXPORT1(favx, pamax3);
176 
177         CEXPORT1(favx, fmadd_k3);
178         CEXPORT1(favx, fmsub_k3);
179         CEXPORT1(favx, fmrsub_k3);
180         CEXPORT1(favx, fmmul_k3);
181         CEXPORT1(favx, fmdiv_k3);
182         CEXPORT1(favx, fmrdiv_k3);
183         CEXPORT1(favx, fmmod_k3);
184         CEXPORT1(favx, fmrmod_k3);
185 
186         CEXPORT1(favx, fmadd_k4);
187         CEXPORT1(favx, fmsub_k4);
188         CEXPORT1(favx, fmrsub_k4);
189         CEXPORT1(favx, fmmul_k4);
190         CEXPORT1(favx, fmdiv_k4);
191         CEXPORT1(favx, fmrdiv_k4);
192         CEXPORT1(favx, fmmod_k4);
193         CEXPORT1(favx, fmrmod_k4);
194 
195         CEXPORT1(favx, fmadd3);
196         CEXPORT1(favx, fmsub3);
197         CEXPORT1(favx, fmrsub3);
198         CEXPORT1(favx, fmmul3);
199         CEXPORT1(favx, fmdiv3);
200         CEXPORT1(favx, fmrdiv3);
201         CEXPORT1(favx, fmmod3);
202         CEXPORT1(favx, fmrmod3);
203 
204         CEXPORT1(favx, fmadd4);
205         CEXPORT1(favx, fmsub4);
206         CEXPORT1(favx, fmrsub4);
207         CEXPORT1(favx, fmmul4);
208         CEXPORT1(favx, fmdiv4);
209         CEXPORT1(favx, fmrdiv4);
210         CEXPORT1(favx, fmmod4);
211         CEXPORT1(favx, fmrmod4);
212 
213         CEXPORT2_X64(favx, abs_add2, x64_abs_add2);
214         CEXPORT2_X64(favx, abs_sub2, x64_abs_sub2);
215         CEXPORT2_X64(favx, abs_rsub2, x64_abs_rsub2);
216         CEXPORT2_X64(favx, abs_mul2, x64_abs_mul2);
217         CEXPORT2_X64(favx, abs_div2, x64_abs_div2);
218         CEXPORT2_X64(favx, abs_rdiv2, x64_abs_rdiv2);
219 
220         CEXPORT2_X64(favx, abs_add3, x64_abs_add3);
221         CEXPORT2_X64(favx, abs_sub3, x64_abs_sub3);
222         CEXPORT2_X64(favx, abs_rsub3, x64_abs_rsub3);
223         CEXPORT2_X64(favx, abs_mul3, x64_abs_mul3);
224         CEXPORT2_X64(favx, abs_div3, x64_abs_div3);
225         CEXPORT2_X64(favx, abs_rdiv3, x64_abs_rdiv3);
226 
227         CEXPORT2_X64(favx, abs1, x64_abs1);
228         CEXPORT2_X64(favx, abs2, x64_abs2);
229 
230         CEXPORT1(favx, complex_mul2);
231         CEXPORT1(favx, complex_mul3);
232         CEXPORT1(favx, complex_div2);
233         CEXPORT1(favx, complex_rdiv2);
234         CEXPORT1(favx, complex_div3);
235         CEXPORT1(favx, complex_mod);
236         CEXPORT1(favx, complex_rcp1);
237         CEXPORT1(favx, complex_rcp2);
238 
239         CEXPORT1(favx, pcomplex_mul2);
240         CEXPORT1(favx, pcomplex_mul3);
241         CEXPORT1(favx, pcomplex_div2);
242         CEXPORT1(favx, pcomplex_rdiv2);
243         CEXPORT1(favx, pcomplex_div3);
244         CEXPORT1(favx, pcomplex_mod);
245         CEXPORT1(favx, pcomplex_rcp1);
246         CEXPORT1(favx, pcomplex_rcp2);
247 
248         CEXPORT1(favx, biquad_process_x1);
249         CEXPORT1(favx, biquad_process_x2);
250         CEXPORT1(favx, biquad_process_x4);
251         EXPORT2_X64(biquad_process_x8, x64_biquad_process_x8);
252 
253         CEXPORT1(favx, dyn_biquad_process_x1);
254         CEXPORT1(favx, dyn_biquad_process_x2);
255         CEXPORT1(favx, dyn_biquad_process_x4);
256         EXPORT2_X64(dyn_biquad_process_x8, x64_dyn_biquad_process_x8);
257 
258         CEXPORT1(favx, bilinear_transform_x1);
259         CEXPORT1(favx, bilinear_transform_x2);
260         CEXPORT1(favx, bilinear_transform_x4);
261         CEXPORT2_X64(favx, bilinear_transform_x8, x64_bilinear_transform_x8);
262 
263         CEXPORT1(favx, h_sum);
264         CEXPORT1(favx, h_sqr_sum);
265         CEXPORT1(favx, h_abs_sum);
266 
267         CEXPORT1(favx, h_dotp);
268         CEXPORT1(favx, h_sqr_dotp);
269         CEXPORT1(favx, h_abs_dotp);
270 
271         CEXPORT1(favx, mix2);
272         CEXPORT1(favx, mix_copy2);
273         CEXPORT1(favx, mix_add2);
274         CEXPORT1(favx, mix3);
275         CEXPORT1(favx, mix_copy3);
276         CEXPORT1(favx, mix_add3);
277         CEXPORT1(favx, mix4);
278         CEXPORT1(favx, mix_copy4);
279         CEXPORT1(favx, mix_add4);
280 
281         CEXPORT1(favx, min);
282         CEXPORT1(favx, max);
283         CEXPORT1(favx, minmax);
284         CEXPORT1(favx, abs_min);
285         CEXPORT1(favx, abs_max);
286         CEXPORT1(favx, abs_minmax);
287 
288         CEXPORT1(favx, lr_to_ms);
289         CEXPORT1(favx, lr_to_mid);
290         CEXPORT1(favx, lr_to_side);
291         CEXPORT1(favx, ms_to_lr);
292         CEXPORT1(favx, ms_to_left);
293         CEXPORT1(favx, ms_to_right);
294 
295         CEXPORT1(favx, direct_fft);
296         CEXPORT1(favx, reverse_fft);
297         CEXPORT1(favx, normalize_fft2);
298         CEXPORT1(favx, normalize_fft3);
299 
300         CEXPORT1(favx, packed_direct_fft);
301         CEXPORT1(favx, packed_reverse_fft);
302 
303         CEXPORT1(favx, fastconv_parse);
304         CEXPORT1(favx, fastconv_restore);
305         CEXPORT1(favx, fastconv_apply);
306         CEXPORT1(favx, fastconv_parse_apply);
307 
308         CEXPORT1(favx, filter_transfer_calc_ri);
309         CEXPORT1(favx, filter_transfer_apply_ri);
310         CEXPORT1(favx, filter_transfer_calc_pc);
311         CEXPORT1(favx, filter_transfer_apply_pc);
312 
313         CEXPORT1(favx, lanczos_resample_2x2);
314         CEXPORT1(favx, lanczos_resample_2x3);
315         CEXPORT1(favx, lanczos_resample_2x4);
316         CEXPORT1(favx, lanczos_resample_3x2);
317         CEXPORT1(favx, lanczos_resample_3x3);
318         CEXPORT1(favx, lanczos_resample_3x4);
319         CEXPORT1(favx, lanczos_resample_4x2);
320         CEXPORT1(favx, lanczos_resample_4x3);
321         CEXPORT1(favx, lanczos_resample_4x4);
322         CEXPORT1(favx, lanczos_resample_6x2);
323         CEXPORT1(favx, lanczos_resample_6x3);
324         CEXPORT1(favx, lanczos_resample_6x4);
325         CEXPORT1(favx, lanczos_resample_8x2);
326         CEXPORT1(favx, lanczos_resample_8x3);
327         CEXPORT1(favx, lanczos_resample_8x4);
328 
329         CEXPORT1(favx, downsample_2x);
330         CEXPORT1(favx, downsample_3x);
331         CEXPORT1(favx, downsample_4x);
332         CEXPORT1(favx, downsample_6x);
333         CEXPORT1(favx, downsample_8x);
334 
335         CEXPORT1(favx, convolve);
336 
337         CEXPORT1(favx, lin_inter_set);
338         CEXPORT1(favx, lin_inter_mul2);
339         CEXPORT1(favx, lin_inter_mul3);
340         CEXPORT1(favx, lin_inter_fmadd2);
341         CEXPORT1(favx, lin_inter_frmadd2);
342         CEXPORT1(favx, lin_inter_fmadd3);
343 
344         // FMA3 support?
345         if (f->features & CPU_OPTION_FMA3)
346         {
347             lsp_trace("Optimizing DSP for FMA3 instruction set");
348 
349             // Conditional export, depending on fast AVX implementation
350             CEXPORT2(favx, mod2, mod2_fma3);
351             CEXPORT2(favx, rmod2, rmod2_fma3);
352 
353             CEXPORT2(favx, mod3, mod3_fma3);
354 
355             CEXPORT2(favx, mod_k2, mod_k2_fma3);
356             CEXPORT2(favx, rmod_k2, rmod_k2_fma3);
357 
358             CEXPORT2(favx, mod_k3, mod_k3_fma3);
359             CEXPORT2(favx, rmod_k3, rmod_k3_fma3);
360 
361             CEXPORT2(favx, fmadd_k3, fmadd_k3_fma3);
362             CEXPORT2(favx, fmsub_k3, fmsub_k3_fma3);
363             CEXPORT2(favx, fmrsub_k3, fmrsub_k3_fma3);
364             CEXPORT2(favx, fmmod_k3, fmmod_k3_fma3);
365             CEXPORT2(favx, fmrmod_k3, fmrmod_k3_fma3);
366 
367             CEXPORT2(favx, fmadd_k4, fmadd_k4_fma3);
368             CEXPORT2(favx, fmsub_k4, fmsub_k4_fma3);
369             CEXPORT2(favx, fmrsub_k4, fmrsub_k4_fma3);
370             CEXPORT2(favx, fmmod_k4, fmmod_k4_fma3);
371             CEXPORT2(favx, fmrmod_k4, fmrmod_k4_fma3);
372 
373             CEXPORT2(favx, fmadd3, fmadd3_fma3);
374             CEXPORT2(favx, fmsub3, fmsub3_fma3);
375             CEXPORT2(favx, fmrsub3, fmrsub3_fma3);
376             CEXPORT2(favx, fmmod3, fmmod3_fma3);
377             CEXPORT2(favx, fmrmod3, fmrmod3_fma3);
378 
379             CEXPORT2(favx, fmadd4, fmadd4_fma3);
380             CEXPORT2(favx, fmsub4, fmsub4_fma3);
381             CEXPORT2(favx, fmrsub4, fmrsub4_fma3);
382             CEXPORT2(favx, fmmod4, fmmod4_fma3);
383             CEXPORT2(favx, fmrmod4, fmrmod4_fma3);
384 
385             CEXPORT2(favx, complex_mul2, complex_mul2_fma3);
386             CEXPORT2(favx, complex_mul3, complex_mul3_fma3);
387             CEXPORT2(favx, complex_div2, complex_div2_fma3);
388             CEXPORT2(favx, complex_rdiv2, complex_rdiv2_fma3);
389             CEXPORT2(favx, complex_div3, complex_div3_fma3);
390             CEXPORT2(favx, complex_mod, complex_mod_fma3);
391             CEXPORT2(favx, complex_rcp1, complex_rcp1_fma3);
392             CEXPORT2(favx, complex_rcp2, complex_rcp2_fma3);
393 
394             CEXPORT2(favx, pcomplex_mul2, pcomplex_mul2_fma3);
395             CEXPORT2(favx, pcomplex_mul3, pcomplex_mul3_fma3);
396             CEXPORT2(favx, pcomplex_div2, pcomplex_div2_fma3);
397             CEXPORT2(favx, pcomplex_rdiv2, pcomplex_rdiv2_fma3);
398             CEXPORT2(favx, pcomplex_div3, pcomplex_div3_fma3);
399 
400             CEXPORT2(favx, h_sqr_sum, h_sqr_sum_fma3);
401 //            CEXPORT2(favx, h_dotp_sum, h_dotp_sum_fma3);
402 
403             CEXPORT2(favx, direct_fft, direct_fft_fma3);
404             CEXPORT2(favx, reverse_fft, reverse_fft_fma3);
405             CEXPORT2(favx, packed_direct_fft, packed_direct_fft_fma3);
406             CEXPORT2(favx, packed_reverse_fft, packed_reverse_fft_fma3);
407 
408             CEXPORT2(favx, fastconv_parse, fastconv_parse_fma3);
409             CEXPORT2(favx, fastconv_restore, fastconv_restore_fma3);
410             CEXPORT2(favx, fastconv_apply, fastconv_apply_fma3);
411             CEXPORT2(favx, fastconv_parse_apply, fastconv_parse_apply_fma3);
412 
413             CEXPORT2(favx, filter_transfer_calc_ri, filter_transfer_calc_ri_fma3);
414             CEXPORT2(favx, filter_transfer_apply_ri, filter_transfer_apply_ri_fma3);
415             CEXPORT2(favx, filter_transfer_calc_pc, filter_transfer_calc_pc_fma3);
416             CEXPORT2(favx, filter_transfer_apply_pc, filter_transfer_apply_pc_fma3);
417 
418             CEXPORT2(favx, convolve, convolve_fma3);
419 
420 
421             CEXPORT2(favx, biquad_process_x1, biquad_process_x1_fma3);
422             CEXPORT2(favx, biquad_process_x2, biquad_process_x2_fma3);
423             CEXPORT2(favx, biquad_process_x4, biquad_process_x4_fma3);
424             CEXPORT2(ffma, biquad_process_x8, biquad_process_x8_fma3);
425 
426             CEXPORT2(ffma, dyn_biquad_process_x1, dyn_biquad_process_x1_fma3);
427             CEXPORT2(favx, dyn_biquad_process_x2, dyn_biquad_process_x2_fma3);
428             CEXPORT2(favx, dyn_biquad_process_x4, dyn_biquad_process_x4_fma3);
429             CEXPORT2(ffma, dyn_biquad_process_x8, dyn_biquad_process_x8_fma3);
430         }
431     }
432 
433     #undef EXPORT1
434     #undef EXPORT2
435 }
436 
437