/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 10 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX_FFT_BUTTERFLY_H_
#define DSP_ARCH_X86_AVX_FFT_BUTTERFLY_H_

#ifndef DSP_ARCH_X86_AVX_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX_IMPL */

/*
 * AVX loop body performing FFT butterflies on 8 complex pairs per iteration
 * (one 256-bit ymm register of packed floats per operand).
 *
 * Macro parameters:
 *   add_b   - mnemonic combining x_re*b_re with x_im*b_im into c_re
 *             (vaddps/vsubps for plain AVX, vfmadd231ps/vfmsub231ps for FMA3);
 *             sign selects direct vs reverse transform
 *   add_a   - mnemonic combining x_re*b_im with x_im*b_re into c_im (opposite sign)
 *   FMA_SEL - two-argument selector macro: expands to its first argument for the
 *             plain-AVX build (FMA_OFF) or its second for the FMA3 build (FMA_ON)
 *
 * Implicit context required at the expansion site:
 *   dst_re, dst_im - float buffers with real/imaginary parts of the samples
 *   fft_a, fft_w   - 32-byte-aligned angle tables: initial twiddles (x_re/x_im)
 *                    and per-iteration rotation coefficients (w_re/w_im)
 *   off1, off2     - byte offsets of the two butterfly operands a and b (advanced
 *                    by 0x20 = 8 floats each iteration)
 *   np             - number of remaining pairs, consumed 8 per iteration
 */
#define FFT_BUTTERFLY_BODY8(add_b, add_a, FMA_SEL) \
    /* On 32-bit x86 there are too few registers to pin all four buffer */ \
    /* pointers, so two scratch pointers are reloaded from "g" operands */ \
    __IF_32(float *ptr1, *ptr2);\
    \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT32("mov %[fft_a], %[ptr2]") \
        __ASM_EMIT32("mov %[dst_re], %[ptr1]") \
        __ASM_EMIT("vmovaps 0x00(%[" __IF_32_64("ptr2", "fft_a") "]), %%ymm6")          /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps 0x20(%[" __IF_32_64("ptr2", "fft_a") "]), %%ymm7")          /* ymm7 = x_im */ \
        __ASM_EMIT32("mov %[dst_im], %[ptr2]") \
        /* Start loop */ \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off1]), %%ymm0") /* ymm0 = a_re */ \
        __ASM_EMIT("vmovups 0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off2]), %%ymm2") /* ymm2 = b_re */ \
        __ASM_EMIT("vmovups 0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off1]), %%ymm1") /* ymm1 = a_im */ \
        __ASM_EMIT("vmovups 0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off2]), %%ymm3") /* ymm3 = b_im */ \
        /* Calculate complex multiplication c = x * b */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm2, %%ymm4")                                     /* ymm4 = x_im * b_re */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm3, %%ymm5")                                     /* ymm5 = x_im * b_im */ \
        /* FMA variant skips the separate multiplies: they are fused below */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm2, %%ymm2", ""))                        /* ymm2 = x_re * b_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm3, %%ymm3", ""))                        /* ymm3 = x_re * b_im */ \
        __ASM_EMIT(FMA_SEL(add_b " %%ymm5, %%ymm2, %%ymm5", add_b " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = c_re = x_re * b_re +- x_im * b_im */ \
        __ASM_EMIT(FMA_SEL(add_a " %%ymm4, %%ymm3, %%ymm4", add_a " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = c_im = x_re * b_im -+ x_im * b_re */ \
        /* Perform butterfly: a' = a + c, b' = a - c */ \
        __ASM_EMIT("vsubps %%ymm5, %%ymm0, %%ymm2")                                     /* ymm2 = a_re - c_re */ \
        __ASM_EMIT("vsubps %%ymm4, %%ymm1, %%ymm3")                                     /* ymm3 = a_im - c_im */ \
        __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0")                                     /* ymm0 = a_re + c_re */ \
        __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1")                                     /* ymm1 = a_im + c_im */ \
        /* Store values back in place */ \
        __ASM_EMIT("vmovups %%ymm0, 0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off1])") \
        __ASM_EMIT("vmovups %%ymm2, 0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off2])") \
        __ASM_EMIT("vmovups %%ymm1, 0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off1])") \
        __ASM_EMIT("vmovups %%ymm3, 0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off2])") \
        /* Advance both offsets by 8 floats (0x20 bytes) */ \
        __ASM_EMIT("add $0x20, %[off1]") \
        __ASM_EMIT("add $0x20, %[off2]") \
        /* 8 pairs done; exit before rotating the angle on the last chunk */ \
        __ASM_EMIT32("subl $8, %[np]") \
        __ASM_EMIT64("subq $8, %[np]") \
        __ASM_EMIT("jz 2f") \
        /* Rotate angle: x' = x * w */ \
        __ASM_EMIT32("mov %[fft_w], %[ptr2]") \
        __ASM_EMIT("vmovaps 0x00(%[" __IF_32_64("ptr2", "fft_w") "]), %%ymm4")          /* ymm4 = w_re */ \
        __ASM_EMIT("vmovaps 0x20(%[" __IF_32_64("ptr2", "fft_w") "]), %%ymm5")          /* ymm5 = w_im */ \
        __ASM_EMIT32("mov %[dst_im], %[ptr2]")                                          /* restore ptr2 clobbered by fft_w load */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm6, %%ymm2")                                     /* ymm2 = w_im * x_re */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm7, %%ymm3")                                     /* ymm3 = w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm6, %%ymm6", ""))                        /* ymm6 = w_re * x_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm7, %%ymm7", ""))                        /* ymm7 = w_re * x_im */ \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vaddps %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
        /* Repeat loop */ \
        __ASM_EMIT("jmp 1b") \
        __ASM_EMIT("2:") \
        \
        : __IF_32([ptr1] "=&r" (ptr1), [ptr2] "=&r" (ptr2), ) \
          [off1] "+r" (off1), [off2] "+r" (off2), [np] __ASM_ARG_RW(np) \
        : __IF_32([dst_re] "g" (dst_re), [dst_im] "g" (dst_im), [fft_a] "g" (fft_a), [fft_w] "g" (fft_w)) \
          __IF_64([dst_re] "r" (dst_re), [dst_im] "r" (dst_im), [fft_a] "r" (fft_a), [fft_w] "r" (fft_w)) \
        : "cc", "memory", \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );

namespace avx
{
    /* Selectors for FFT_BUTTERFLY_BODY8's FMA_SEL parameter:
       FMA_OFF picks the plain-AVX form, FMA_ON picks the FMA3 form */
    #define FMA_OFF(a, b)       a
    #define FMA_ON(a, b)        b

    /*
     * Perform direct (forward) FFT butterflies, plain-AVX variant.
     *
     * dst_re, dst_im - real and imaginary sample buffers, modified in place
     * rank           - butterfly rank; each block spans (1 << rank) pairs
     *                  (np is consumed 8 at a time, so rank >= 3 is assumed;
     *                  FFT_A/FFT_DW indexing additionally requires rank >= 2)
     * blocks         - number of consecutive butterfly blocks to process
     */
    static inline void butterfly_direct8p(float *dst_re, float *dst_im, size_t rank, size_t blocks)
    {
        size_t pairs = 1 << rank;
        size_t off1 = 0, shift = 4 << rank; //1 << (rank + 2): byte size of one half-block
        /* NOTE(review): FFT_A/FFT_DW appear to store 16-float (re+im ymm pair)
           entries per rank, hence the << 4 — defined elsewhere, confirm there */
        const float *fft_a = &FFT_A[(rank - 2) << 4];
        const float *fft_w = &FFT_DW[(rank - 2) << 4];

        for (size_t b=0; b<blocks; ++b)
        {
            size_t off2 = off1 + shift;     /* second operand lives one half-block ahead */
            size_t np = pairs;

            FFT_BUTTERFLY_BODY8("vaddps", "vsubps", FMA_OFF);

            off1 = off2;                    /* asm advanced off2 past the block end */
        }
    }

    /*
     * Perform reverse (inverse) FFT butterflies, plain-AVX variant.
     * Same contract as butterfly_direct8p; only the signs of the complex
     * multiplication differ (conjugate twiddle).
     */
    static inline void butterfly_reverse8p(float *dst_re, float *dst_im, size_t rank, size_t blocks)
    {
        size_t pairs = 1 << rank;
        size_t off1 = 0, shift = 4 << rank; // 1 << (rank + 2): byte size of one half-block
        const float *fft_a = &FFT_A[(rank - 2) << 4];
        const float *fft_w = &FFT_DW[(rank - 2) << 4];

        for (size_t b=0; b<blocks; ++b)
        {
            size_t off2 = off1 + shift;     /* second operand lives one half-block ahead */
            size_t np = pairs;

            FFT_BUTTERFLY_BODY8("vsubps", "vaddps", FMA_OFF);

            off1 = off2;                    /* asm advanced off2 past the block end */
        }
    }

    /*
     * Perform direct (forward) FFT butterflies, FMA3 variant.
     * Same contract as butterfly_direct8p; uses fused multiply-add forms
     * (vfmadd231ps/vfmsub231ps) selected via FMA_ON.
     */
    static inline void butterfly_direct8p_fma3(float *dst_re, float *dst_im, size_t rank, size_t blocks)
    {
        size_t pairs = 1 << rank;
        size_t off1 = 0, shift = 4 << rank; // 1 << (rank + 2): byte size of one half-block
        const float *fft_a = &FFT_A[(rank - 2) << 4];
        const float *fft_w = &FFT_DW[(rank - 2) << 4];

        for (size_t b=0; b<blocks; ++b)
        {
            size_t off2 = off1 + shift;     /* second operand lives one half-block ahead */
            size_t np = pairs;

            FFT_BUTTERFLY_BODY8("vfmadd231ps", "vfmsub231ps", FMA_ON);

            off1 = off2;                    /* asm advanced off2 past the block end */
        }
    }

    /*
     * Perform reverse (inverse) FFT butterflies, FMA3 variant.
     * Same contract as butterfly_reverse8p, with fused multiply-add forms.
     */
    static inline void butterfly_reverse8p_fma3(float *dst_re, float *dst_im, size_t rank, size_t blocks)
    {
        size_t pairs = 1 << rank;
        size_t off1 = 0, shift = 4 << rank; // 1 << (rank + 2): byte size of one half-block
        const float *fft_a = &FFT_A[(rank - 2) << 4];
        const float *fft_w = &FFT_DW[(rank - 2) << 4];

        for (size_t b=0; b<blocks; ++b)
        {
            size_t off2 = off1 + shift;     /* second operand lives one half-block ahead */
            size_t np = pairs;

            FFT_BUTTERFLY_BODY8("vfmsub231ps", "vfmadd231ps", FMA_ON);

            off1 = off2;                    /* asm advanced off2 past the block end */
        }
    }

    #undef FMA_OFF
    #undef FMA_ON
}

#undef FFT_BUTTERFLY_BODY8

#endif /* DSP_ARCH_X86_AVX_FFT_BUTTERFLY_H_ */