1 /* 2 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/> 3 * (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com> 4 * 5 * This file is part of lsp-plugins 6 * Created on: 09 марта 2016 г. 7 * 8 * lsp-plugins is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Lesser General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * any later version. 12 * 13 * lsp-plugins is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public License 19 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>. 20 */ 21 22 #include <dsp/dsp.h> 23 #include <dsp/bits.h> 24 #include <test/test.h> 25 26 #include <core/types.h> 27 #include <core/debug.h> 28 29 #include <dsp/arch/x86/features.h> 30 31 #define DSP_ARCH_X86_AVX_IMPL 32 33 #include <dsp/arch/x86/avx/xcr.h> 34 #include <dsp/arch/x86/avx/const.h> 35 36 #include <dsp/arch/x86/avx/copy.h> 37 #include <dsp/arch/x86/avx/float.h> 38 #include <dsp/arch/x86/avx/complex.h> 39 #include <dsp/arch/x86/avx/pcomplex.h> 40 41 #include <dsp/arch/x86/avx/pmath/op_kx.h> 42 #include <dsp/arch/x86/avx/pmath/op_vv.h> 43 #include <dsp/arch/x86/avx/pmath/fmop_kx.h> 44 #include <dsp/arch/x86/avx/pmath/fmop_vv.h> 45 #include <dsp/arch/x86/avx/pmath/abs_vv.h> 46 #include <dsp/arch/x86/avx/pmath/minmax.h> 47 48 #include <dsp/arch/x86/avx/hmath/hsum.h> 49 #include <dsp/arch/x86/avx/hmath/hdotp.h> 50 51 #include <dsp/arch/x86/avx/mix.h> 52 #include <dsp/arch/x86/avx/search/minmax.h> 53 54 #include <dsp/arch/x86/avx/fft.h> 55 #include <dsp/arch/x86/avx/pfft.h> 56 #include <dsp/arch/x86/avx/fastconv.h> 57 58 #include <dsp/arch/x86/avx/filters/static.h> 59 #include <dsp/arch/x86/avx/filters/dynamic.h> 60 #include <dsp/arch/x86/avx/filters/transform.h> 61 #include <dsp/arch/x86/avx/filters/transfer.h> 62 63 #include <dsp/arch/x86/avx/msmatrix.h> 64 #include <dsp/arch/x86/avx/resampling.h> 65 #include <dsp/arch/x86/avx/convolution.h> 66 #include <dsp/arch/x86/avx/interpolate.h> 67 68 #undef DSP_ARCH_X86_AVX_IMPL 69 70 namespace avx 71 { 72 using namespace x86; 73 74 #define EXPORT2(function, export) { dsp::function = avx::export; TEST_EXPORT(avx::export); } 75 #define EXPORT1(function) EXPORT2(function, function) 76 77 #define EXPORT2_X64(function, export) IF_ARCH_X86_64(EXPORT2(function, export)); 78 #define SUPPORT_X64(function) IF_ARCH_X86_64(TEST_EXPORT(avx::function)) 79 80 #define CEXPORT2(cond, function, export) \ 81 IF_ARCH_X86( \ 82 TEST_EXPORT(avx::export); \ 83 if (cond) \ 84 dsp::function = avx::export; \ 85 ); 86 87 #define CEXPORT1(cond, export) \ 88 IF_ARCH_X86( \ 89 TEST_EXPORT(avx::export); \ 90 if (cond) \ 91 dsp::export = avx::export; \ 92 ); 93 94 #define CEXPORT2_X64(cond, function, export) \ 95 IF_ARCH_X86_64( \ 96 TEST_EXPORT(avx::export); \ 97 if (cond) \ 98 dsp::function = avx::export; \ 99 ); 100 101 #define CEXPORT1_X64(cond, export) \ 102 IF_ARCH_X86_64( \ 103 TEST_EXPORT(avx::export); \ 104 if (cond) \ 105 dsp::export = avx::export; \ 106 ); 107 dsp_init(const cpu_features_t * f)108 void dsp_init(const cpu_features_t *f) 109 { 110 if (!(f->features & CPU_OPTION_AVX)) 111 return; 112 113 lsp_trace("Optimizing DSP for AVX instruction set"); 114 115 TEST_EXPORT(avx::copy); 116 117 // This routine sucks on AMD Bulldozer processor family but is pretty great on Intel 118 // Not tested on AMD Processors above Bulldozer family 119 bool favx = feature_check(f, FEAT_FAST_AVX); 120 bool ffma = favx && feature_check(f, FEAT_FAST_FMA3); 121 122 CEXPORT2_X64(favx, reverse1, reverse1); 123 CEXPORT2_X64(favx, reverse2, reverse2); 124 125 CEXPORT1(favx, limit1); 126 CEXPORT1(favx, limit2); 127 CEXPORT1(favx, sanitize1); 128 CEXPORT1(favx, sanitize2); 129 130 // Conditional export, depending on fast AVX implementation 131 CEXPORT1(favx, add_k2); 132 CEXPORT1(favx, sub_k2); 133 CEXPORT1(favx, rsub_k2); 134 CEXPORT1(favx, mul_k2); 135 CEXPORT1(favx, div_k2); 136 CEXPORT1(favx, rdiv_k2); 137 CEXPORT1(favx, mod_k2); 138 CEXPORT1(favx, rmod_k2); 139 140 CEXPORT1(favx, add_k3); 141 CEXPORT1(favx, sub_k3); 142 CEXPORT1(favx, rsub_k3); 143 CEXPORT1(favx, mul_k3); 144 CEXPORT1(favx, div_k3); 145 CEXPORT1(favx, rdiv_k3); 146 CEXPORT1(favx, mod_k3); 147 CEXPORT1(favx, rmod_k3); 148 149 CEXPORT1(favx, add2); 150 CEXPORT1(favx, sub2); 151 CEXPORT1(favx, rsub2); 152 CEXPORT1(favx, mul2); 153 CEXPORT1(favx, div2); 154 CEXPORT1(favx, rdiv2); 155 CEXPORT1(favx, mod2); 156 CEXPORT1(favx, rmod2); 157 158 CEXPORT1(favx, add3); 159 CEXPORT1(favx, sub3); 160 CEXPORT1(favx, mul3); 161 CEXPORT1(favx, div3); 162 CEXPORT1(favx, mod3); 163 164 CEXPORT1(favx, pmin2); 165 CEXPORT1(favx, pmax2); 166 CEXPORT1(favx, psmin2); 167 CEXPORT1(favx, psmax2); 168 CEXPORT1(favx, pamin2); 169 CEXPORT1(favx, pamax2); 170 CEXPORT1(favx, pmin3); 171 CEXPORT1(favx, pmax3); 172 CEXPORT1(favx, psmin3); 173 CEXPORT1(favx, psmax3); 174 CEXPORT1(favx, pamin3); 175 CEXPORT1(favx, pamax3); 176 177 CEXPORT1(favx, fmadd_k3); 178 CEXPORT1(favx, fmsub_k3); 179 CEXPORT1(favx, fmrsub_k3); 180 CEXPORT1(favx, fmmul_k3); 181 CEXPORT1(favx, fmdiv_k3); 182 CEXPORT1(favx, fmrdiv_k3); 183 CEXPORT1(favx, fmmod_k3); 184 CEXPORT1(favx, fmrmod_k3); 185 186 CEXPORT1(favx, fmadd_k4); 187 CEXPORT1(favx, fmsub_k4); 188 CEXPORT1(favx, fmrsub_k4); 189 CEXPORT1(favx, fmmul_k4); 190 CEXPORT1(favx, fmdiv_k4); 191 CEXPORT1(favx, fmrdiv_k4); 192 CEXPORT1(favx, fmmod_k4); 193 CEXPORT1(favx, fmrmod_k4); 194 195 CEXPORT1(favx, fmadd3); 196 CEXPORT1(favx, fmsub3); 197 CEXPORT1(favx, fmrsub3); 198 CEXPORT1(favx, fmmul3); 199 CEXPORT1(favx, fmdiv3); 200 CEXPORT1(favx, fmrdiv3); 201 CEXPORT1(favx, fmmod3); 202 CEXPORT1(favx, fmrmod3); 203 204 CEXPORT1(favx, fmadd4); 205 CEXPORT1(favx, fmsub4); 206 CEXPORT1(favx, fmrsub4); 207 CEXPORT1(favx, fmmul4); 208 CEXPORT1(favx, fmdiv4); 209 CEXPORT1(favx, fmrdiv4); 210 CEXPORT1(favx, fmmod4); 211 CEXPORT1(favx, fmrmod4); 212 213 CEXPORT2_X64(favx, abs_add2, x64_abs_add2); 214 CEXPORT2_X64(favx, abs_sub2, x64_abs_sub2); 215 CEXPORT2_X64(favx, abs_rsub2, x64_abs_rsub2); 216 CEXPORT2_X64(favx, abs_mul2, x64_abs_mul2); 217 CEXPORT2_X64(favx, abs_div2, x64_abs_div2); 218 CEXPORT2_X64(favx, abs_rdiv2, x64_abs_rdiv2); 219 220 CEXPORT2_X64(favx, abs_add3, x64_abs_add3); 221 CEXPORT2_X64(favx, abs_sub3, x64_abs_sub3); 222 CEXPORT2_X64(favx, abs_rsub3, x64_abs_rsub3); 223 CEXPORT2_X64(favx, abs_mul3, x64_abs_mul3); 224 CEXPORT2_X64(favx, abs_div3, x64_abs_div3); 225 CEXPORT2_X64(favx, abs_rdiv3, x64_abs_rdiv3); 226 227 CEXPORT2_X64(favx, abs1, x64_abs1); 228 CEXPORT2_X64(favx, abs2, x64_abs2); 229 230 CEXPORT1(favx, complex_mul2); 231 CEXPORT1(favx, complex_mul3); 232 CEXPORT1(favx, complex_div2); 233 CEXPORT1(favx, complex_rdiv2); 234 CEXPORT1(favx, complex_div3); 235 CEXPORT1(favx, complex_mod); 236 CEXPORT1(favx, complex_rcp1); 237 CEXPORT1(favx, complex_rcp2); 238 239 CEXPORT1(favx, pcomplex_mul2); 240 CEXPORT1(favx, pcomplex_mul3); 241 CEXPORT1(favx, pcomplex_div2); 242 CEXPORT1(favx, pcomplex_rdiv2); 243 CEXPORT1(favx, pcomplex_div3); 244 CEXPORT1(favx, pcomplex_mod); 245 CEXPORT1(favx, pcomplex_rcp1); 246 CEXPORT1(favx, pcomplex_rcp2); 247 248 CEXPORT1(favx, biquad_process_x1); 249 CEXPORT1(favx, biquad_process_x2); 250 CEXPORT1(favx, biquad_process_x4); 251 EXPORT2_X64(biquad_process_x8, x64_biquad_process_x8); 252 253 CEXPORT1(favx, dyn_biquad_process_x1); 254 CEXPORT1(favx, dyn_biquad_process_x2); 255 CEXPORT1(favx, dyn_biquad_process_x4); 256 EXPORT2_X64(dyn_biquad_process_x8, x64_dyn_biquad_process_x8); 257 258 CEXPORT1(favx, bilinear_transform_x1); 259 CEXPORT1(favx, bilinear_transform_x2); 260 CEXPORT1(favx, bilinear_transform_x4); 261 CEXPORT2_X64(favx, bilinear_transform_x8, x64_bilinear_transform_x8); 262 263 CEXPORT1(favx, h_sum); 264 CEXPORT1(favx, h_sqr_sum); 265 CEXPORT1(favx, h_abs_sum); 266 267 CEXPORT1(favx, h_dotp); 268 CEXPORT1(favx, h_sqr_dotp); 269 CEXPORT1(favx, h_abs_dotp); 270 271 CEXPORT1(favx, mix2); 272 CEXPORT1(favx, mix_copy2); 273 CEXPORT1(favx, mix_add2); 274 CEXPORT1(favx, mix3); 275 CEXPORT1(favx, mix_copy3); 276 CEXPORT1(favx, mix_add3); 277 CEXPORT1(favx, mix4); 278 CEXPORT1(favx, mix_copy4); 279 CEXPORT1(favx, mix_add4); 280 281 CEXPORT1(favx, min); 282 CEXPORT1(favx, max); 283 CEXPORT1(favx, minmax); 284 CEXPORT1(favx, abs_min); 285 CEXPORT1(favx, abs_max); 286 CEXPORT1(favx, abs_minmax); 287 288 CEXPORT1(favx, lr_to_ms); 289 CEXPORT1(favx, lr_to_mid); 290 CEXPORT1(favx, lr_to_side); 291 CEXPORT1(favx, ms_to_lr); 292 CEXPORT1(favx, ms_to_left); 293 CEXPORT1(favx, ms_to_right); 294 295 CEXPORT1(favx, direct_fft); 296 CEXPORT1(favx, reverse_fft); 297 CEXPORT1(favx, normalize_fft2); 298 CEXPORT1(favx, normalize_fft3); 299 300 CEXPORT1(favx, packed_direct_fft); 301 CEXPORT1(favx, packed_reverse_fft); 302 303 CEXPORT1(favx, fastconv_parse); 304 CEXPORT1(favx, fastconv_restore); 305 CEXPORT1(favx, fastconv_apply); 306 CEXPORT1(favx, fastconv_parse_apply); 307 308 CEXPORT1(favx, filter_transfer_calc_ri); 309 CEXPORT1(favx, filter_transfer_apply_ri); 310 CEXPORT1(favx, filter_transfer_calc_pc); 311 CEXPORT1(favx, filter_transfer_apply_pc); 312 313 CEXPORT1(favx, lanczos_resample_2x2); 314 CEXPORT1(favx, lanczos_resample_2x3); 315 CEXPORT1(favx, lanczos_resample_2x4); 316 CEXPORT1(favx, lanczos_resample_3x2); 317 CEXPORT1(favx, lanczos_resample_3x3); 318 CEXPORT1(favx, lanczos_resample_3x4); 319 CEXPORT1(favx, lanczos_resample_4x2); 320 CEXPORT1(favx, lanczos_resample_4x3); 321 CEXPORT1(favx, lanczos_resample_4x4); 322 CEXPORT1(favx, lanczos_resample_6x2); 323 CEXPORT1(favx, lanczos_resample_6x3); 324 CEXPORT1(favx, lanczos_resample_6x4); 325 CEXPORT1(favx, lanczos_resample_8x2); 326 CEXPORT1(favx, lanczos_resample_8x3); 327 CEXPORT1(favx, lanczos_resample_8x4); 328 329 CEXPORT1(favx, downsample_2x); 330 CEXPORT1(favx, downsample_3x); 331 CEXPORT1(favx, downsample_4x); 332 CEXPORT1(favx, downsample_6x); 333 CEXPORT1(favx, downsample_8x); 334 335 CEXPORT1(favx, convolve); 336 337 CEXPORT1(favx, lin_inter_set); 338 CEXPORT1(favx, lin_inter_mul2); 339 CEXPORT1(favx, lin_inter_mul3); 340 CEXPORT1(favx, lin_inter_fmadd2); 341 CEXPORT1(favx, lin_inter_frmadd2); 342 CEXPORT1(favx, lin_inter_fmadd3); 343 344 // FMA3 support? 345 if (f->features & CPU_OPTION_FMA3) 346 { 347 lsp_trace("Optimizing DSP for FMA3 instruction set"); 348 349 // Conditional export, depending on fast AVX implementation 350 CEXPORT2(favx, mod2, mod2_fma3); 351 CEXPORT2(favx, rmod2, rmod2_fma3); 352 353 CEXPORT2(favx, mod3, mod3_fma3); 354 355 CEXPORT2(favx, mod_k2, mod_k2_fma3); 356 CEXPORT2(favx, rmod_k2, rmod_k2_fma3); 357 358 CEXPORT2(favx, mod_k3, mod_k3_fma3); 359 CEXPORT2(favx, rmod_k3, rmod_k3_fma3); 360 361 CEXPORT2(favx, fmadd_k3, fmadd_k3_fma3); 362 CEXPORT2(favx, fmsub_k3, fmsub_k3_fma3); 363 CEXPORT2(favx, fmrsub_k3, fmrsub_k3_fma3); 364 CEXPORT2(favx, fmmod_k3, fmmod_k3_fma3); 365 CEXPORT2(favx, fmrmod_k3, fmrmod_k3_fma3); 366 367 CEXPORT2(favx, fmadd_k4, fmadd_k4_fma3); 368 CEXPORT2(favx, fmsub_k4, fmsub_k4_fma3); 369 CEXPORT2(favx, fmrsub_k4, fmrsub_k4_fma3); 370 CEXPORT2(favx, fmmod_k4, fmmod_k4_fma3); 371 CEXPORT2(favx, fmrmod_k4, fmrmod_k4_fma3); 372 373 CEXPORT2(favx, fmadd3, fmadd3_fma3); 374 CEXPORT2(favx, fmsub3, fmsub3_fma3); 375 CEXPORT2(favx, fmrsub3, fmrsub3_fma3); 376 CEXPORT2(favx, fmmod3, fmmod3_fma3); 377 CEXPORT2(favx, fmrmod3, fmrmod3_fma3); 378 379 CEXPORT2(favx, fmadd4, fmadd4_fma3); 380 CEXPORT2(favx, fmsub4, fmsub4_fma3); 381 CEXPORT2(favx, fmrsub4, fmrsub4_fma3); 382 CEXPORT2(favx, fmmod4, fmmod4_fma3); 383 CEXPORT2(favx, fmrmod4, fmrmod4_fma3); 384 385 CEXPORT2(favx, complex_mul2, complex_mul2_fma3); 386 CEXPORT2(favx, complex_mul3, complex_mul3_fma3); 387 CEXPORT2(favx, complex_div2, complex_div2_fma3); 388 CEXPORT2(favx, complex_rdiv2, complex_rdiv2_fma3); 389 CEXPORT2(favx, complex_div3, complex_div3_fma3); 390 CEXPORT2(favx, complex_mod, complex_mod_fma3); 391 CEXPORT2(favx, complex_rcp1, complex_rcp1_fma3); 392 CEXPORT2(favx, complex_rcp2, complex_rcp2_fma3); 393 394 CEXPORT2(favx, pcomplex_mul2, pcomplex_mul2_fma3); 395 CEXPORT2(favx, pcomplex_mul3, pcomplex_mul3_fma3); 396 CEXPORT2(favx, pcomplex_div2, pcomplex_div2_fma3); 397 CEXPORT2(favx, pcomplex_rdiv2, pcomplex_rdiv2_fma3); 398 CEXPORT2(favx, pcomplex_div3, pcomplex_div3_fma3); 399 400 CEXPORT2(favx, h_sqr_sum, h_sqr_sum_fma3); 401 // CEXPORT2(favx, h_dotp_sum, h_dotp_sum_fma3); 402 403 CEXPORT2(favx, direct_fft, direct_fft_fma3); 404 CEXPORT2(favx, reverse_fft, reverse_fft_fma3); 405 CEXPORT2(favx, packed_direct_fft, packed_direct_fft_fma3); 406 CEXPORT2(favx, packed_reverse_fft, packed_reverse_fft_fma3); 407 408 CEXPORT2(favx, fastconv_parse, fastconv_parse_fma3); 409 CEXPORT2(favx, fastconv_restore, fastconv_restore_fma3); 410 CEXPORT2(favx, fastconv_apply, fastconv_apply_fma3); 411 CEXPORT2(favx, fastconv_parse_apply, fastconv_parse_apply_fma3); 412 413 CEXPORT2(favx, filter_transfer_calc_ri, filter_transfer_calc_ri_fma3); 414 CEXPORT2(favx, filter_transfer_apply_ri, filter_transfer_apply_ri_fma3); 415 CEXPORT2(favx, filter_transfer_calc_pc, filter_transfer_calc_pc_fma3); 416 CEXPORT2(favx, filter_transfer_apply_pc, filter_transfer_apply_pc_fma3); 417 418 CEXPORT2(favx, convolve, convolve_fma3); 419 420 421 CEXPORT2(favx, biquad_process_x1, biquad_process_x1_fma3); 422 CEXPORT2(favx, biquad_process_x2, biquad_process_x2_fma3); 423 CEXPORT2(favx, biquad_process_x4, biquad_process_x4_fma3); 424 CEXPORT2(ffma, biquad_process_x8, biquad_process_x8_fma3); 425 426 CEXPORT2(ffma, dyn_biquad_process_x1, dyn_biquad_process_x1_fma3); 427 CEXPORT2(favx, dyn_biquad_process_x2, dyn_biquad_process_x2_fma3); 428 CEXPORT2(favx, dyn_biquad_process_x4, dyn_biquad_process_x4_fma3); 429 CEXPORT2(ffma, dyn_biquad_process_x8, dyn_biquad_process_x8_fma3); 430 } 431 } 432 433 #undef EXPORT1 434 #undef EXPORT2 435 } 436 437