/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 23 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX2_FFT_NORMALIZE_H_
#define DSP_ARCH_X86_AVX2_FFT_NORMALIZE_H_

#ifndef DSP_ARCH_X86_AVX2_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX2_IMPL */

namespace avx2
{
    /**
     * Normalize FFT spectrum after a forward transform, out-of-place variant:
     * multiply each of the 2^rank real and 2^rank imaginary samples by
     * k = 1 / 2^rank and store the results into separate destination buffers.
     *
     * @param dre   destination buffer for the normalized real part (2^rank floats)
     * @param dim   destination buffer for the normalized imaginary part (2^rank floats)
     * @param re    source buffer with the real part
     * @param im    source buffer with the imaginary part
     * @param rank  FFT rank; the number of samples processed is 2^rank
     *
     * NOTE(review): for rank < 3 (fewer than 8 samples) neither the x16 loop nor
     * the x8 tail executes, so nothing is written — presumably callers dispatch
     * small ranks to a different implementation; confirm against the caller.
     */
    void normalize_fft3(float *dre, float *dim, const float *re, const float *im, size_t rank)
    {
        IF_ARCH_X86(
            float k = 1.0f/(1 << rank);             // normalization coefficient
            size_t count = 1 << rank, off = 0;      // number of samples, byte offset into buffers
        );
        ARCH_X86_ASM(
            // x16 blocks: two parallel 8-float (32-byte) streams per buffer per iteration
            __ASM_EMIT ("vbroadcastss %%xmm0, %%ymm0")                      // ymm0 = k (the "+Yz" constraint below pins k to xmm0)
            __ASM_EMIT32("subl $16, %[count]")                              // count -= 16; borrow flag set when count < 16
            __ASM_EMIT64("sub $16, %[count]")
            __ASM_EMIT ("vmovaps %%ymm0, %%ymm1")                           // ymm1 = k, second copy for the parallel multiply stream
            __ASM_EMIT ("jb 2f")                                            // skip main loop if fewer than 16 samples remain
            __ASM_EMIT ("1:")
            __ASM_EMIT ("vmulps 0x00(%[s_re], %[off]), %%ymm0, %%ymm4")     // ymm4 = k * re[0..7]
            __ASM_EMIT ("vmulps 0x20(%[s_re], %[off]), %%ymm1, %%ymm5")     // ymm5 = k * re[8..15]
            __ASM_EMIT ("vmulps 0x00(%[s_im], %[off]), %%ymm0, %%ymm6")     // ymm6 = k * im[0..7]
            __ASM_EMIT ("vmulps 0x20(%[s_im], %[off]), %%ymm1, %%ymm7")     // ymm7 = k * im[8..15]
            __ASM_EMIT ("vmovups %%ymm4, 0x00(%[d_re], %[off])")            // unaligned stores to the destination buffers
            __ASM_EMIT ("vmovups %%ymm5, 0x20(%[d_re], %[off])")
            __ASM_EMIT ("vmovups %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT ("vmovups %%ymm7, 0x20(%[d_im], %[off])")
            __ASM_EMIT ("add $0x40, %[off]")                                // advance byte offset by 16 floats
            __ASM_EMIT32("subl $16, %[count]")
            __ASM_EMIT64("sub $16, %[count]")
            __ASM_EMIT ("jae 1b")
            __ASM_EMIT ("2:")
            // x8 block: handle the remaining 8 samples (FFT sizes are powers of two,
            // so after the x16 loop the remainder is either 8 or 0)
            __ASM_EMIT32("addl $8, %[count]")                               // count += 8; restores part of the bias from the loop above
            __ASM_EMIT64("add $8, %[count]")
            __ASM_EMIT ("jl 4f")                                            // skip if fewer than 8 samples remain
            __ASM_EMIT ("vmulps 0x00(%[s_re], %[off]), %%ymm0, %%ymm4")
            __ASM_EMIT ("vmulps 0x00(%[s_im], %[off]), %%ymm1, %%ymm6")
            __ASM_EMIT ("vmovups %%ymm4, 0x00(%[d_re], %[off])")
            __ASM_EMIT ("vmovups %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT ("4:")
            // NOTE(review): __ASM_ARG_RW presumably selects a memory operand on
            // 32-bit builds (register scarcity), matching the "l"-suffixed
            // subl/addl forms above — confirm against the macro definition
            : [off] "+r" (off), [count] __ASM_ARG_RW(count),
              [k] "+Yz" (k)                                                 // "Yz" = first SSE register (xmm0), required by vbroadcastss above
            : [s_re] "r" (re), [s_im] "r" (im),
              [d_re] "r" (dre), [d_im] "r" (dim)
            : "cc", "memory",
              "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /**
     * Normalize FFT spectrum after a forward transform, in-place variant:
     * multiply each of the 2^rank real and 2^rank imaginary samples by
     * k = 1 / 2^rank, overwriting the input buffers.
     *
     * @param re    buffer with the real part, updated in place (2^rank floats)
     * @param im    buffer with the imaginary part, updated in place (2^rank floats)
     * @param rank  FFT rank; the number of samples processed is 2^rank
     *
     * NOTE(review): as with normalize_fft3, rank < 3 leaves the buffers
     * untouched — presumably guaranteed not to happen by callers.
     */
    void normalize_fft2(float *re, float *im, size_t rank)
    {
        IF_ARCH_X86(
            float k = 1.0f/(1 << rank);             // normalization coefficient
            size_t count = 1 << rank, off = 0;      // number of samples, byte offset into buffers
        );
        ARCH_X86_ASM(
            // x16 blocks: same structure as normalize_fft3 but loads and stores
            // go through the same pointers (in-place update)
            __ASM_EMIT ("vbroadcastss %%xmm0, %%ymm0")                      // ymm0 = k (k pinned to xmm0 by the "+Yz" constraint)
            __ASM_EMIT32("subl $16, %[count]")
            __ASM_EMIT64("sub $16, %[count]")
            __ASM_EMIT ("vmovaps %%ymm0, %%ymm1")                           // ymm1 = k, second copy for the parallel multiply stream
            __ASM_EMIT ("jb 2f")
            __ASM_EMIT ("1:")
            __ASM_EMIT ("vmulps 0x00(%[d_re], %[off]), %%ymm0, %%ymm4")     // ymm4 = k * re[0..7]
            __ASM_EMIT ("vmulps 0x20(%[d_re], %[off]), %%ymm1, %%ymm5")     // ymm5 = k * re[8..15]
            __ASM_EMIT ("vmulps 0x00(%[d_im], %[off]), %%ymm0, %%ymm6")     // ymm6 = k * im[0..7]
            __ASM_EMIT ("vmulps 0x20(%[d_im], %[off]), %%ymm1, %%ymm7")     // ymm7 = k * im[8..15]
            __ASM_EMIT ("vmovups %%ymm4, 0x00(%[d_re], %[off])")            // write results back over the inputs
            __ASM_EMIT ("vmovups %%ymm5, 0x20(%[d_re], %[off])")
            __ASM_EMIT ("vmovups %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT ("vmovups %%ymm7, 0x20(%[d_im], %[off])")
            __ASM_EMIT ("add $0x40, %[off]")                                // advance byte offset by 16 floats
            __ASM_EMIT32("subl $16, %[count]")
            __ASM_EMIT64("sub $16, %[count]")
            __ASM_EMIT ("jae 1b")
            __ASM_EMIT ("2:")
            // x8 block: remaining 8 samples, if any (power-of-two sizes only)
            __ASM_EMIT32("addl $8, %[count]")
            __ASM_EMIT64("add $8, %[count]")
            __ASM_EMIT ("jl 4f")
            __ASM_EMIT ("vmulps 0x00(%[d_re], %[off]), %%ymm0, %%ymm4")
            __ASM_EMIT ("vmulps 0x00(%[d_im], %[off]), %%ymm1, %%ymm6")
            __ASM_EMIT ("vmovups %%ymm4, 0x00(%[d_re], %[off])")
            __ASM_EMIT ("vmovups %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT ("4:")
            : [off] "+r" (off), [count] "+r" (count),
              [k] "+Yz" (k)                                                 // "Yz" = first SSE register (xmm0)
            : [d_re] "r" (re), [d_im] "r" (im)
            : "cc", "memory",
              "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
}


#endif /* DSP_ARCH_X86_AVX2_FFT_NORMALIZE_H_ */