1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 13 дек. 2019 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef DSP_ARCH_X86_AVX_FASTCONV_H_
23 #define DSP_ARCH_X86_AVX_FASTCONV_H_
24 
25 #ifndef DSP_ARCH_X86_AVX_IMPL
26     #error "This header should not be included directly"
27 #endif /* DSP_ARCH_X86_AVX_IMPL */
28 
29 #include <dsp/arch/x86/avx/fastconv/prepare.h>
30 #include <dsp/arch/x86/avx/fastconv/butterfly.h>
31 #include <dsp/arch/x86/avx/fastconv/apply.h>
32 
33 namespace avx
34 {
fastconv_parse(float * dst,const float * src,size_t rank)35     void fastconv_parse(float *dst, const float *src, size_t rank)
36     {
37         const float *ak = &FFT_A[(rank - 3) << 4];
38         const float *wk = &FFT_DW[(rank - 3) << 4];
39         size_t np       = 1 << (rank - 1);
40         size_t nb       = 1;
41 
42         if (np > 4)
43         {
44             fastconv_direct_prepare(dst, src, ak, wk, np);
45             ak         -= 16;
46             wk         -= 16;
47             np        >>= 1;
48             nb        <<= 1;
49         }
50         else
51             fastconv_direct_unpack(dst, src);
52 
53         while (np > 4)
54         {
55             fastconv_direct_butterfly(dst, ak, wk, np, nb);
56             ak         -= 16;
57             wk         -= 16;
58             np        >>= 1;
59             nb        <<= 1;
60         }
61 
62         fastconv_direct_butterfly_last(dst, nb);
63     }
64 
fastconv_parse_fma3(float * dst,const float * src,size_t rank)65     void fastconv_parse_fma3(float *dst, const float *src, size_t rank)
66     {
67         const float *ak = &FFT_A[(rank - 3) << 4];
68         const float *wk = &FFT_DW[(rank - 3) << 4];
69         size_t np       = 1 << (rank - 1);
70         size_t nb       = 1;
71 
72         if (np > 4)
73         {
74             fastconv_direct_prepare_fma3(dst, src, ak, wk, np);
75             ak         -= 16;
76             wk         -= 16;
77             np        >>= 1;
78             nb        <<= 1;
79         }
80         else
81             fastconv_direct_unpack(dst, src);
82 
83         while (np > 4)
84         {
85             fastconv_direct_butterfly_fma3(dst, ak, wk, np, nb);
86             ak         -= 16;
87             wk         -= 16;
88             np        >>= 1;
89             nb        <<= 1;
90         }
91 
92         fastconv_direct_butterfly_last_fma3(dst, nb);
93     }
94 
fastconv_restore(float * dst,float * tmp,size_t rank)95     void fastconv_restore(float *dst, float *tmp, size_t rank)
96     {
97         size_t nb = 1 << (rank - 3), np = 4;
98         const float *ak = FFT_A;
99         const float *wk = FFT_DW;
100 
101         fastconv_reverse_prepare(tmp, nb);
102         if ((nb >>= 1) <= 0)
103         {
104             fastconv_reverse_unpack(dst, tmp, rank);
105             return;
106         }
107         ak     += 16;
108         wk     += 16;
109         np    <<= 1;
110 
111         while (nb > 1)
112         {
113             fastconv_reverse_butterfly(tmp, ak, wk, np, nb);
114             ak     += 16;
115             wk     += 16;
116             np    <<= 1;
117             nb    >>= 1;
118         }
119 
120         fastconv_reverse_butterfly_last(dst, tmp, ak, wk, np);
121     }
122 
fastconv_restore_fma3(float * dst,float * tmp,size_t rank)123     void fastconv_restore_fma3(float *dst, float *tmp, size_t rank)
124     {
125         size_t nb = 1 << (rank - 3), np = 4;
126         const float *ak = FFT_A;
127         const float *wk = FFT_DW;
128 
129         fastconv_reverse_prepare_fma3(tmp, nb);
130         if ((nb >>= 1) <= 0)
131         {
132             fastconv_reverse_unpack(dst, tmp, rank);
133             return;
134         }
135         ak     += 16;
136         wk     += 16;
137         np    <<= 1;
138 
139         while (nb > 1)
140         {
141             fastconv_reverse_butterfly_fma3(tmp, ak, wk, np, nb);
142             ak     += 16;
143             wk     += 16;
144             np    <<= 1;
145             nb    >>= 1;
146         }
147 
148         fastconv_reverse_butterfly_last_fma3(dst, tmp, ak, wk, np);
149     }
150 
fastconv_apply(float * dst,float * tmp,const float * c1,const float * c2,size_t rank)151     void fastconv_apply(float *dst, float *tmp, const float *c1, const float *c2, size_t rank)
152     {
153         size_t nb = 1 << (rank - 3), np = 4;
154         const float *ak = FFT_A;
155         const float *wk = FFT_DW;
156 
157         fastconv_apply_prepare(tmp, c1, c2, nb);
158         if ((nb >>= 1) <= 0)
159         {
160             fastconv_reverse_unpack_adding(dst, tmp, rank);
161             return;
162         }
163         ak     += 16;
164         wk     += 16;
165         np    <<= 1;
166 
167         while (nb > 1)
168         {
169             fastconv_reverse_butterfly(tmp, ak, wk, np, nb);
170             ak     += 16;
171             wk     += 16;
172             np    <<= 1;
173             nb    >>= 1;
174         }
175 
176         fastconv_reverse_butterfly_last_adding(dst, tmp, ak, wk, np);
177     }
178 
fastconv_apply_fma3(float * dst,float * tmp,const float * c1,const float * c2,size_t rank)179     void fastconv_apply_fma3(float *dst, float *tmp, const float *c1, const float *c2, size_t rank)
180     {
181         size_t nb = 1 << (rank - 3), np = 4;
182         const float *ak = FFT_A;
183         const float *wk = FFT_DW;
184 
185         fastconv_apply_prepare_fma3(tmp, c1, c2, nb);
186         if ((nb >>= 1) <= 0)
187         {
188             fastconv_reverse_unpack_adding(dst, tmp, rank);
189             return;
190         }
191         ak     += 16;
192         wk     += 16;
193         np    <<= 1;
194 
195         while (nb > 1)
196         {
197             fastconv_reverse_butterfly_fma3(tmp, ak, wk, np, nb);
198             ak     += 16;
199             wk     += 16;
200             np    <<= 1;
201             nb    >>= 1;
202         }
203 
204         fastconv_reverse_butterfly_last_adding_fma3(dst, tmp, ak, wk, np);
205     }
206 
fastconv_parse_apply(float * dst,float * tmp,const float * c,const float * src,size_t rank)207     void fastconv_parse_apply(float *dst, float *tmp, const float *c, const float *src, size_t rank)
208     {
209         const float *ak = &FFT_A[(rank - 3) << 4];
210         const float *wk = &FFT_DW[(rank - 3) << 4];
211         size_t np       = 1 << (rank - 1);
212         size_t nb       = 1;
213 
214         if (np > 4)
215         {
216             fastconv_direct_prepare(tmp, src, ak, wk, np);
217             ak         -= 16;
218             wk         -= 16;
219             np        >>= 1;
220             nb        <<= 1;
221         }
222         else
223             fastconv_direct_unpack(tmp, src);
224 
225         while (np > 4)
226         {
227             fastconv_direct_butterfly(tmp, ak, wk, np, nb);
228             ak         -= 16;
229             wk         -= 16;
230             np        >>= 1;
231             nb        <<= 1;
232         }
233 
234         fastconv_apply_internal(tmp, c, nb);
235 
236         if ((nb >>= 1) <= 0)
237         {
238             fastconv_reverse_unpack_adding(dst, tmp, rank);
239             return;
240         }
241         ak     += 16;
242         wk     += 16;
243         np    <<= 1;
244 
245         while (nb > 1)
246         {
247             fastconv_reverse_butterfly(tmp, ak, wk, np, nb);
248             ak     += 16;
249             wk     += 16;
250             np    <<= 1;
251             nb    >>= 1;
252         }
253 
254         fastconv_reverse_butterfly_last_adding(dst, tmp, ak, wk, np);
255     }
256 
fastconv_parse_apply_fma3(float * dst,float * tmp,const float * c,const float * src,size_t rank)257     void fastconv_parse_apply_fma3(float *dst, float *tmp, const float *c, const float *src, size_t rank)
258     {
259         const float *ak = &FFT_A[(rank - 3) << 4];
260         const float *wk = &FFT_DW[(rank - 3) << 4];
261         size_t np       = 1 << (rank - 1);
262         size_t nb       = 1;
263 
264         if (np > 4)
265         {
266             fastconv_direct_prepare_fma3(tmp, src, ak, wk, np);
267             ak         -= 16;
268             wk         -= 16;
269             np        >>= 1;
270             nb        <<= 1;
271         }
272         else
273             fastconv_direct_unpack(tmp, src);
274 
275         while (np > 4)
276         {
277             fastconv_direct_butterfly_fma3(tmp, ak, wk, np, nb);
278             ak         -= 16;
279             wk         -= 16;
280             np        >>= 1;
281             nb        <<= 1;
282         }
283 
284         fastconv_apply_internal_fma3(tmp, c, nb);
285 
286         if ((nb >>= 1) <= 0)
287         {
288             fastconv_reverse_unpack_adding(dst, tmp, rank);
289             return;
290         }
291         ak     += 16;
292         wk     += 16;
293         np    <<= 1;
294 
295         while (nb > 1)
296         {
297             fastconv_reverse_butterfly_fma3(tmp, ak, wk, np, nb);
298             ak     += 16;
299             wk     += 16;
300             np    <<= 1;
301             nb    >>= 1;
302         }
303 
304         fastconv_reverse_butterfly_last_adding_fma3(dst, tmp, ak, wk, np);
305     }
306 }
307 
308 #endif /* DSP_ARCH_X86_AVX_FASTCONV_H_ */
309