1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 10 дек. 2019 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef DSP_ARCH_X86_AVX_FFT_BUTTERFLY_H_
23 #define DSP_ARCH_X86_AVX_FFT_BUTTERFLY_H_
24 
25 #ifndef DSP_ARCH_X86_AVX_IMPL
26     #error "This header should not be included directly"
27 #endif /* DSP_ARCH_X86_AVX_IMPL */
28 
/*
 * FFT butterfly kernel processing 8 complex points per iteration using 256-bit AVX.
 * Parameters:
 *   add_b   - mnemonic combining x_re*b_re with x_im*b_im (add for direct FFT, sub for reverse)
 *   add_a   - mnemonic combining x_re*b_im with x_im*b_re (sub for direct FFT, add for reverse)
 *   FMA_SEL - selector macro: expands to its first argument for plain AVX, second for FMA3
 * Variables captured by name from the enclosing scope:
 *   dst_re, dst_im - real/imaginary sample buffers, updated in place
 *   off1, off2     - byte offsets of the two butterfly halves, advanced by 0x20 per iteration
 *   np             - number of points left to process, decremented by 8 per iteration
 *   fft_a, fft_w   - initial angle and angle-rotation tables (32-byte aligned loads)
 * On 32-bit builds pointer operands are staged through ptr1/ptr2 due to register pressure.
 */
#define FFT_BUTTERFLY_BODY8(add_b, add_a, FMA_SEL) \
    __IF_32(float *ptr1, *ptr2);\
    \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT32("mov           %[fft_a], %[ptr2]") \
        __ASM_EMIT32("mov           %[dst_re], %[ptr1]") \
        __ASM_EMIT("vmovaps         0x00(%[" __IF_32_64("ptr2", "fft_a") "]), %%ymm6")        /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps         0x20(%[" __IF_32_64("ptr2", "fft_a") "]), %%ymm7")        /* ymm7 = x_im */ \
        __ASM_EMIT32("mov           %[dst_im], %[ptr2]") \
        /* Start loop */ \
        __ASM_EMIT("1:") \
            __ASM_EMIT("vmovups         0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off1]), %%ymm0")    /* ymm0 = a_re */ \
            __ASM_EMIT("vmovups         0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off2]), %%ymm2")    /* ymm2 = b_re */ \
            __ASM_EMIT("vmovups         0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off1]), %%ymm1")    /* ymm1 = a_im */ \
            __ASM_EMIT("vmovups         0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off2]), %%ymm3")    /* ymm3 = b_im */ \
            /* Calculate complex multiplication */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm2, %%ymm4")            /* ymm4 = x_im * b_re */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm3, %%ymm5")            /* ymm5 = x_im * b_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm2, %%ymm2", ""))       /* ymm2 = x_re * b_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm3, %%ymm3", ""))       /* ymm3 = x_re * b_im */ \
            __ASM_EMIT(FMA_SEL(add_b "  %%ymm5, %%ymm2, %%ymm5", add_b " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = c_re = x_re * b_re +- x_im * b_im */ \
            __ASM_EMIT(FMA_SEL(add_a "  %%ymm4, %%ymm3, %%ymm4", add_a " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = c_im = x_re * b_im -+ x_im * b_re */ \
            /* Perform butterfly */ \
            __ASM_EMIT("vsubps          %%ymm5, %%ymm0, %%ymm2")            /* ymm2 = a_re - c_re */ \
            __ASM_EMIT("vsubps          %%ymm4, %%ymm1, %%ymm3")            /* ymm3 = a_im - c_im */ \
            __ASM_EMIT("vaddps          %%ymm5, %%ymm0, %%ymm0")            /* ymm0 = a_re + c_re */ \
            __ASM_EMIT("vaddps          %%ymm4, %%ymm1, %%ymm1")            /* ymm1 = a_im + c_im */ \
            /* Store values */ \
            __ASM_EMIT("vmovups         %%ymm0, 0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off1])") \
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[" __IF_32_64("ptr1", "dst_re") "], %[off2])") \
            __ASM_EMIT("vmovups         %%ymm1, 0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off1])") \
            __ASM_EMIT("vmovups         %%ymm3, 0x00(%[" __IF_32_64("ptr2", "dst_im") "], %[off2])") \
            __ASM_EMIT("add             $0x20, %[off1]") \
            __ASM_EMIT("add             $0x20, %[off2]") \
            __ASM_EMIT32("subl          $8, %[np]") \
            __ASM_EMIT64("subq          $8, %[np]") \
            __ASM_EMIT("jz              2f") \
            /* Rotate angle */ \
            __ASM_EMIT32("mov           %[fft_w], %[ptr2]") \
            __ASM_EMIT("vmovaps         0x00(%[" __IF_32_64("ptr2", "fft_w") "]), %%ymm4")        /* ymm4 = w_re */ \
            __ASM_EMIT("vmovaps         0x20(%[" __IF_32_64("ptr2", "fft_w") "]), %%ymm5")        /* ymm5 = w_im */ \
            __ASM_EMIT32("mov           %[dst_im], %[ptr2]") \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm6, %%ymm2")            /* ymm2 = w_im * x_re */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm7, %%ymm3")            /* ymm3 = w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm6, %%ymm6", ""))       /* ymm6 = w_re * x_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm7, %%ymm7", ""))       /* ymm7 = w_re * x_im */ \
            __ASM_EMIT(FMA_SEL("vsubps  %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vaddps  %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
            /* Repeat loop */ \
        __ASM_EMIT("jmp             1b") \
        __ASM_EMIT("2:") \
        \
        : __IF_32([ptr1] "=&r" (ptr1), [ptr2] "=&r" (ptr2), ) \
          [off1] "+r" (off1), [off2] "+r" (off2), [np] __ASM_ARG_RW(np) \
        : __IF_32([dst_re] "g" (dst_re), [dst_im] "g" (dst_im), [fft_a] "g" (fft_a), [fft_w] "g" (fft_w)) \
          __IF_64([dst_re] "r" (dst_re), [dst_im] "r" (dst_im), [fft_a] "r" (fft_a), [fft_w] "r" (fft_w)) \
        : "cc", "memory",  \
        "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
        "%xmm4", "%xmm5", "%xmm6", "%xmm7"  \
    );
91 
namespace avx
{
/* FMA_SEL dispatch for FFT_BUTTERFLY_BODY8: FMA_OFF selects the plain-AVX
 * operand form (first argument), FMA_ON the FMA3 form (second argument). */
#define FMA_OFF(a, b)       a
#define FMA_ON(a, b)        b
butterfly_direct8p(float * dst_re,float * dst_im,size_t rank,size_t blocks)97     static inline void butterfly_direct8p(float *dst_re, float *dst_im, size_t rank, size_t blocks)
98     {
99         size_t pairs = 1 << rank;
100         size_t off1 = 0, shift = 4 << rank; //1 << (rank + 2);
101         const float *fft_a = &FFT_A[(rank - 2) << 4];
102         const float *fft_w = &FFT_DW[(rank - 2) << 4];
103 
104         for (size_t b=0; b<blocks; ++b)
105         {
106             size_t off2  = off1 + shift;
107             size_t np    = pairs;
108 
109             FFT_BUTTERFLY_BODY8("vaddps", "vsubps", FMA_OFF);
110 
111             off1        = off2;
112         }
113     }
114 
butterfly_reverse8p(float * dst_re,float * dst_im,size_t rank,size_t blocks)115     static inline void butterfly_reverse8p(float *dst_re, float *dst_im, size_t rank, size_t blocks)
116     {
117         size_t pairs = 1 << rank;
118         size_t off1 = 0, shift = 4 << rank; // 1 << (rank + 2);
119         const float *fft_a = &FFT_A[(rank - 2) << 4];
120         const float *fft_w = &FFT_DW[(rank - 2) << 4];
121 
122         for (size_t b=0; b<blocks; ++b)
123         {
124             size_t off2  = off1 + shift;
125             size_t np    = pairs;
126 
127             FFT_BUTTERFLY_BODY8("vsubps", "vaddps", FMA_OFF);
128 
129             off1        = off2;
130         }
131     }
132 
butterfly_direct8p_fma3(float * dst_re,float * dst_im,size_t rank,size_t blocks)133     static inline void butterfly_direct8p_fma3(float *dst_re, float *dst_im, size_t rank, size_t blocks)
134     {
135         size_t pairs = 1 << rank;
136         size_t off1 = 0, shift = 4 << rank; // 1 << (rank + 2);
137         const float *fft_a = &FFT_A[(rank - 2) << 4];
138         const float *fft_w = &FFT_DW[(rank - 2) << 4];
139 
140         for (size_t b=0; b<blocks; ++b)
141         {
142             size_t off2  = off1 + shift;
143             size_t np    = pairs;
144 
145             FFT_BUTTERFLY_BODY8("vfmadd231ps", "vfmsub231ps", FMA_ON);
146 
147             off1        = off2;
148         }
149     }
150 
butterfly_reverse8p_fma3(float * dst_re,float * dst_im,size_t rank,size_t blocks)151     static inline void butterfly_reverse8p_fma3(float *dst_re, float *dst_im, size_t rank, size_t blocks)
152     {
153         size_t pairs = 1 << rank;
154         size_t off1 = 0, shift = 4 << rank; // 1 << (rank + 2);
155         const float *fft_a = &FFT_A[(rank - 2) << 4];
156         const float *fft_w = &FFT_DW[(rank - 2) << 4];
157 
158         for (size_t b=0; b<blocks; ++b)
159         {
160             size_t off2  = off1 + shift;
161             size_t np    = pairs;
162 
163             FFT_BUTTERFLY_BODY8("vfmsub231ps", "vfmadd231ps", FMA_ON);
164 
165             off1        = off2;
166         }
167     }
168 
169 #undef FMA_OFF
170 #undef FMA_ON
171 }
172 
173 #undef FFT_BUTTERFLY_BODY8
174 
175 #endif /* DSP_ARCH_X86_AVX_FFT_BUTTERFLY_H_ */
176