1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 31 окт. 2018 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <dsp/dsp.h>
23 #include <test/mtest.h>
24 #include <test/FloatBuffer.h>
25 #include <core/debug.h>
26 
27 #define RANK        6
28 #define BUF_SIZE    (1 << RANK)
29 
30 static const float XFFT_DW[] __lsp_aligned16 =
31 {
32     // Re, Im
33     0.0000000000000000f, 1.0000000000000000f,
34     0.0000000000000000f, 1.0000000000000000f,
35     0.7071067811865475f, 0.7071067811865475f,
36     0.9238795325112868f, 0.3826834323650898f,
37     0.9807852804032305f, 0.1950903220161283f,
38     0.9951847266721969f, 0.0980171403295606f,
39     0.9987954562051724f, 0.0490676743274180f,
40     0.9996988186962042f, 0.0245412285229123f,
41     0.9999247018391445f, 0.0122715382857199f,
42     0.9999811752826011f, 0.0061358846491545f,
43     0.9999952938095762f, 0.0030679567629660f,
44     0.9999988234517019f, 0.0015339801862848f,
45     0.9999997058628822f, 0.0007669903187427f,
46     0.9999999264657179f, 0.0003834951875714f,
47     0.9999999816164293f, 0.0001917475973107f,
48     0.9999999954041073f, 0.0000958737990960f,
49     0.9999999988510268f, 0.0000479368996031f
50 };
51 
52 static const float XFFT_A_RE[] __lsp_aligned16 =
53 {
54     1.0000000000000000f, 0.7071067811865475f, 0.0000000000000000f, -0.7071067811865475f,
55     1.0000000000000000f, 0.9238795325112868f, 0.7071067811865475f, 0.3826834323650898f,
56     1.0000000000000000f, 0.9807852804032305f, 0.9238795325112868f, 0.8314696123025452f,
57     1.0000000000000000f, 0.9951847266721969f, 0.9807852804032305f, 0.9569403357322089f,
58     1.0000000000000000f, 0.9987954562051724f, 0.9951847266721969f, 0.9891765099647810f,
59     1.0000000000000000f, 0.9996988186962042f, 0.9987954562051724f, 0.9972904566786902f,
60     1.0000000000000000f, 0.9999247018391445f, 0.9996988186962042f, 0.9993223845883495f,
61     1.0000000000000000f, 0.9999811752826011f, 0.9999247018391445f, 0.9998305817958234f,
62     1.0000000000000000f, 0.9999952938095762f, 0.9999811752826011f, 0.9999576445519639f,
63     1.0000000000000000f, 0.9999988234517019f, 0.9999952938095762f, 0.9999894110819284f,
64     1.0000000000000000f, 0.9999997058628822f, 0.9999988234517019f, 0.9999973527669782f,
65     1.0000000000000000f, 0.9999999264657179f, 0.9999997058628822f, 0.9999993381915255f,
66     1.0000000000000000f, 0.9999999816164293f, 0.9999999264657179f, 0.9999998345478677f,
67     1.0000000000000000f, 0.9999999954041073f, 0.9999999816164293f, 0.9999999586369661f,
68     1.0000000000000000f, 0.9999999988510268f, 0.9999999954041073f, 0.9999999896592415f
69 };
70 
71 static const float XFFT_A_IM[] __lsp_aligned16 =
72 {
73     0.0000000000000000f, 0.7071067811865475f, 1.0000000000000000f, 0.7071067811865476f,
74     0.0000000000000000f, 0.3826834323650898f, 0.7071067811865475f, 0.9238795325112867f,
75     0.0000000000000000f, 0.1950903220161283f, 0.3826834323650898f, 0.5555702330196022f,
76     0.0000000000000000f, 0.0980171403295606f, 0.1950903220161283f, 0.2902846772544624f,
77     0.0000000000000000f, 0.0490676743274180f, 0.0980171403295606f, 0.1467304744553617f,
78     0.0000000000000000f, 0.0245412285229123f, 0.0490676743274180f, 0.0735645635996674f,
79     0.0000000000000000f, 0.0122715382857199f, 0.0245412285229123f, 0.0368072229413588f,
80     0.0000000000000000f, 0.0061358846491545f, 0.0122715382857199f, 0.0184067299058048f,
81     0.0000000000000000f, 0.0030679567629660f, 0.0061358846491545f, 0.0092037547820598f,
82     0.0000000000000000f, 0.0015339801862848f, 0.0030679567629660f, 0.0046019261204486f,
83     0.0000000000000000f, 0.0007669903187427f, 0.0015339801862848f, 0.0023009691514258f,
84     0.0000000000000000f, 0.0003834951875714f, 0.0007669903187427f, 0.0011504853371138f,
85     0.0000000000000000f, 0.0001917475973107f, 0.0003834951875714f, 0.0005752427637321f,
86     0.0000000000000000f, 0.0000958737990960f, 0.0001917475973107f, 0.0002876213937629f,
87     0.0000000000000000f, 0.0000479368996031f, 0.0000958737990960f, 0.0001438106983686f
88 };
89 
fastconv_parse(float * dst,const float * src,size_t rank)90 static void fastconv_parse(float *dst, const float *src, size_t rank)
91 {
92     // Prepare for butterflies
93     float c_re[4], c_im[4], w_re[4], w_im[4];
94     const float *dw     = &XFFT_DW[(rank - 3) << 1];
95     const float *iw_re  = &XFFT_A_RE[(rank-3) << 2];
96     const float *iw_im  = &XFFT_A_IM[(rank-3) << 2];
97     size_t items        = size_t(1) << (rank + 1);
98     size_t bs           = items;
99     size_t n            = bs >> 1;
100 
101     // Iterate first cycle
102     if (n > 4)
103     {
104         // ONE LARGE CYCLE
105         // Set initial values of pointers
106         float *a            = dst;
107         float *b            = &a[n];
108 
109         w_re[0]             = iw_re[0];
110         w_re[1]             = iw_re[1];
111         w_re[2]             = iw_re[2];
112         w_re[3]             = iw_re[3];
113         w_im[0]             = iw_im[0];
114         w_im[1]             = iw_im[1];
115         w_im[2]             = iw_im[2];
116         w_im[3]             = iw_im[3];
117 
118         for (size_t k=0; ;)
119         {
120             // Calculate the output values:
121             // a'   = a + 0
122             // b'   = (a-0) * w
123             a[0]            = src[0];
124             a[1]            = src[1];
125             a[2]            = src[2];
126             a[3]            = src[3];
127 
128             a[4]            = 0.0f;
129             a[5]            = 0.0f;
130             a[6]            = 0.0f;
131             a[7]            = 0.0f;
132 
133             // Calculate complex c = w * b
134             b[0]            = w_re[0] * a[0];
135             b[1]            = w_re[1] * a[1];
136             b[2]            = w_re[2] * a[2];
137             b[3]            = w_re[3] * a[3];
138 
139             b[4]            = -w_im[0] * a[0];
140             b[5]            = -w_im[1] * a[1];
141             b[6]            = -w_im[2] * a[2];
142             b[7]            = -w_im[3] * a[3];
143 
144             // Update pointers
145             a              += 8;
146             b              += 8;
147             src            += 4;
148 
149             if ((k += 8) >= n)
150                 break;
151 
152             // Rotate w vector
153             c_re[0]         = w_re[0]*dw[0] - w_im[0]*dw[1];
154             c_re[1]         = w_re[1]*dw[0] - w_im[1]*dw[1];
155             c_re[2]         = w_re[2]*dw[0] - w_im[2]*dw[1];
156             c_re[3]         = w_re[3]*dw[0] - w_im[3]*dw[1];
157 
158             c_im[0]         = w_re[0]*dw[1] + w_im[0]*dw[0];
159             c_im[1]         = w_re[1]*dw[1] + w_im[1]*dw[0];
160             c_im[2]         = w_re[2]*dw[1] + w_im[2]*dw[0];
161             c_im[3]         = w_re[3]*dw[1] + w_im[3]*dw[0];
162 
163             w_re[0]         = c_re[0];
164             w_re[1]         = c_re[1];
165             w_re[2]         = c_re[2];
166             w_re[3]         = c_re[3];
167 
168             w_im[0]         = c_im[0];
169             w_im[1]         = c_im[1];
170             w_im[2]         = c_im[2];
171             w_im[3]         = c_im[3];
172         }
173 
174         dw     -= 2;
175         iw_re  -= 4;
176         iw_im  -= 4;
177 
178         n >>= 1;
179         bs >>= 1;
180     }
181     else
182     {
183         // Unpack 4x real to 4x split complex
184         dst[0]      = src[0];
185         dst[1]      = src[1];
186         dst[2]      = src[2];
187         dst[3]      = src[3];
188 
189         dst[4]      = 0.0f;
190         dst[5]      = 0.0f;
191         dst[6]      = 0.0f;
192         dst[7]      = 0.0f;
193     }
194 
195     // Iterate butterflies
196     for (; n > 4; n >>= 1, bs >>= 1)
197     {
198         for (size_t p=0; p<items; p += bs)
199         {
200             // Set initial values of pointers
201             float *a            = &dst[p];
202             float *b            = &a[n];
203 
204             w_re[0]             = iw_re[0];
205             w_re[1]             = iw_re[1];
206             w_re[2]             = iw_re[2];
207             w_re[3]             = iw_re[3];
208             w_im[0]             = iw_im[0];
209             w_im[1]             = iw_im[1];
210             w_im[2]             = iw_im[2];
211             w_im[3]             = iw_im[3];
212 
213             for (size_t k=0; ;)
214             {
215                 // Calculate the output values:
216                 // c    = a - b
217                 // a'   = a + b
218                 // b'   = c * w
219                 c_re[0]         = a[0] - b[0];
220                 c_re[1]         = a[1] - b[1];
221                 c_re[2]         = a[2] - b[2];
222                 c_re[3]         = a[3] - b[3];
223 
224                 c_im[0]         = a[4] - b[4];
225                 c_im[1]         = a[5] - b[5];
226                 c_im[2]         = a[6] - b[6];
227                 c_im[3]         = a[7] - b[7];
228 
229                 a[0]            = a[0] + b[0];
230                 a[1]            = a[1] + b[1];
231                 a[2]            = a[2] + b[2];
232                 a[3]            = a[3] + b[3];
233 
234                 a[4]            = a[4] + b[4];
235                 a[5]            = a[5] + b[5];
236                 a[6]            = a[6] + b[6];
237                 a[7]            = a[7] + b[7];
238 
239                 // Calculate complex c = w * b
240                 b[0]            = w_re[0] * c_re[0] + w_im[0] * c_im[0];
241                 b[1]            = w_re[1] * c_re[1] + w_im[1] * c_im[1];
242                 b[2]            = w_re[2] * c_re[2] + w_im[2] * c_im[2];
243                 b[3]            = w_re[3] * c_re[3] + w_im[3] * c_im[3];
244 
245                 b[4]            = w_re[0] * c_im[0] - w_im[0] * c_re[0];
246                 b[5]            = w_re[1] * c_im[1] - w_im[1] * c_re[1];
247                 b[6]            = w_re[2] * c_im[2] - w_im[2] * c_re[2];
248                 b[7]            = w_re[3] * c_im[3] - w_im[3] * c_re[3];
249 
250                 // Update pointers
251                 a              += 8;
252                 b              += 8;
253 
254                 if ((k += 8) >= n)
255                     break;
256 
257                 // Rotate w vector
258                 c_re[0]         = w_re[0]*dw[0] - w_im[0]*dw[1];
259                 c_re[1]         = w_re[1]*dw[0] - w_im[1]*dw[1];
260                 c_re[2]         = w_re[2]*dw[0] - w_im[2]*dw[1];
261                 c_re[3]         = w_re[3]*dw[0] - w_im[3]*dw[1];
262 
263                 c_im[0]         = w_re[0]*dw[1] + w_im[0]*dw[0];
264                 c_im[1]         = w_re[1]*dw[1] + w_im[1]*dw[0];
265                 c_im[2]         = w_re[2]*dw[1] + w_im[2]*dw[0];
266                 c_im[3]         = w_re[3]*dw[1] + w_im[3]*dw[0];
267 
268                 w_re[0]         = c_re[0];
269                 w_re[1]         = c_re[1];
270                 w_re[2]         = c_re[2];
271                 w_re[3]         = c_re[3];
272 
273                 w_im[0]         = c_im[0];
274                 w_im[1]         = c_im[1];
275                 w_im[2]         = c_im[2];
276                 w_im[3]         = c_im[3];
277             }
278         }
279 
280         dw     -= 2;
281         iw_re  -= 4;
282         iw_im  -= 4;
283     }
284 
285     // Add two last stages
286     for (size_t i=0; i<items; i += 8)
287     {
288         // s0' = s0 + s2            = (r0 + r2) + j*(i0 + i2)
289         // s1' = s1 + s3            = (r1 + r3) + j*(i1 + i3)
290         // s2' = s0 - s2            = 1*((r0 - r2) + j*(i0 - i2)) = (r0 - r2) + j*(i0 - i2)
291         // s3' = -j*(s1 - s3)       = -j*((r1 - r3) + j*(i1 - i3)) = (i1 - i3) - j*(r1 - r3)
292 
293         // s0" = s0' + s1'          = (r0 + r2) + j*(i0 + i2) + (r1 + r3) + j*(i1 + i3)
294         // s1" = s0' - s1'          = (r0 + r2) + j*(i0 + i2) - (r1 + r3) - j*(i1 + i3)
295         // s2" = s2' + s3'          = (r0 - r2) + j*(i0 - i2) + (i1 - i3) - j*(r1 - r3)
296         // s3" = s2' - s3'          = (r0 - r2) + j*(i0 - i2) - (i1 - i3) + j*(r1 - r3)
297 
298         float r0k       = dst[0] + dst[2];
299         float r1k       = dst[0] - dst[2];
300         float r2k       = dst[1] + dst[3];
301         float r3k       = dst[1] - dst[3];
302 
303         float i0k       = dst[4] + dst[6];
304         float i1k       = dst[4] - dst[6];
305         float i2k       = dst[5] + dst[7];
306         float i3k       = dst[5] - dst[7];
307 
308 //        dst[0]          = r0k;
309 //        dst[1]          = r2k;
310 //        dst[2]          = r1k;
311 //        dst[3]          = i3k;
312 //
313 //        dst[4]          = i0k;
314 //        dst[5]          = i2k;
315 //        dst[6]          = i1k;
316 //        dst[7]          = r3k;
317 
318         dst[0]          = r0k + r2k;
319         dst[1]          = r0k - r2k;
320         dst[2]          = r1k + i3k;
321         dst[3]          = r1k - i3k;
322 
323         dst[4]          = i0k + i2k;
324         dst[5]          = i0k - i2k;
325         dst[6]          = i1k - r3k;
326         dst[7]          = i1k + r3k;
327 
328         dst            += 8;
329     }
330 
331     // Now all complex numbers are stored in the following rormat:
332     // [r0 r1 r2 r3 i0 i1 i2 i3  r4 r5 r6 r7 i4 i5 i6 i7  ... ]
333 }
334 
fastconv_restore(float * dst,float * tmp,size_t rank)335 static void fastconv_restore(float *dst, float *tmp, size_t rank)
336 {
337     float c_re[4], c_im[4], w_re[4], w_im[4];
338 
339     // Prepare for butterflies
340     size_t last     = size_t(1) << rank;
341     size_t items    = last << 1;
342     size_t n        = 8;
343     size_t bs       = n << 1;
344 
345     // Add two last stages
346     float *d        = tmp;
347 
348     // All complex numbers are stored in the following format:
349     // [r0 r1 r2 r3 i0 i1 i2 i3  r4 r5 r6 r7 i4 i5 i6 i7  ... ]
350     for (size_t i=0; i<items; i += 8)
351     {
352         // s0' = s0 + s1        = (r0 + r1) + j*(i0 + i1)
353         // s1' = s0 - s1        = (r0 - r1) + j*(i0 - i1)
354         // s2' = s2 + s3        = (r2 + r3) + j*(i2 + i3)
355         // s3' = s2 - s3        = (r2 - r3) + j*(i2 - i3)
356 
357         // s0" = s0' + s2'      = (r0 + r1) + j*(i0 + i1) + (r2 + r3) + j*(i2 + i3)
358         // s1" = s1' + j*s3'    = (r0 - r1) + j*(i0 - i1) - (i2 - i3) + j*(r2 - r3)
359         // s2" = s0' - s2'      = (r0 + r1) + j*(i0 + i1) - (r2 + r3) - j*(i2 + i3)
360         // s3" = s1' - j*s3'    = (r0 - r1) + j*(i0 - i1) + (i2 - i3) - j*(r2 - r3)
361 
362         float r0k       = d[0] + d[1];
363         float r1k       = d[0] - d[1];
364         float r2k       = d[2] + d[3];
365         float r3k       = d[2] - d[3];
366 
367         float i0k       = d[4] + d[5];
368         float i1k       = d[4] - d[5];
369         float i2k       = d[6] + d[7];
370         float i3k       = d[6] - d[7];
371 
372         d[0]            = r0k + r2k;
373         d[1]            = r1k - i3k;
374         d[2]            = r0k - r2k;
375         d[3]            = r1k + i3k;
376 
377         d[4]            = i0k + i2k;
378         d[5]            = i1k + r3k;
379         d[6]            = i0k - i2k;
380         d[7]            = i1k - r3k;
381 
382         d              += 8;
383     }
384 
385     const float *dw     = XFFT_DW;
386     const float *iw_re  = XFFT_A_RE;
387     const float *iw_im  = XFFT_A_IM;
388 
389     // Iterate butterflies
390     while (n < last)
391     {
392 //        lsp_dumpf("iw_re", "%.6f", iw_re, 4);
393 //        lsp_dumpf("iw_im", "%.6f", iw_im, 4);
394 //        lsp_dumpf("dw   ", "%.6f", dw, 2);
395 
396         for (size_t p=0; p<items; p += bs)
397         {
398             // Set initial values of pointers
399             float *a            = &tmp[p];
400             float *b            = &a[n];
401 
402             w_re[0]             = iw_re[0];
403             w_re[1]             = iw_re[1];
404             w_re[2]             = iw_re[2];
405             w_re[3]             = iw_re[3];
406             w_im[0]             = iw_im[0];
407             w_im[1]             = iw_im[1];
408             w_im[2]             = iw_im[2];
409             w_im[3]             = iw_im[3];
410 
411             for (size_t k=0; ;)
412             {
413                 // Calculate complex c = w * b
414                 c_re[0]         = w_re[0] * b[0] - w_im[0] * b[4];
415                 c_re[1]         = w_re[1] * b[1] - w_im[1] * b[5];
416                 c_re[2]         = w_re[2] * b[2] - w_im[2] * b[6];
417                 c_re[3]         = w_re[3] * b[3] - w_im[3] * b[7];
418 
419                 c_im[0]         = w_re[0] * b[4] + w_im[0] * b[0];
420                 c_im[1]         = w_re[1] * b[5] + w_im[1] * b[1];
421                 c_im[2]         = w_re[2] * b[6] + w_im[2] * b[2];
422                 c_im[3]         = w_re[3] * b[7] + w_im[3] * b[3];
423 
424                 // Calculate the output values:
425                 // a'   = a + c
426                 // b'   = a - c
427                 b[0]            = a[0] - c_re[0];
428                 b[1]            = a[1] - c_re[1];
429                 b[2]            = a[2] - c_re[2];
430                 b[3]            = a[3] - c_re[3];
431 
432                 b[4]            = a[4] - c_im[0];
433                 b[5]            = a[5] - c_im[1];
434                 b[6]            = a[6] - c_im[2];
435                 b[7]            = a[7] - c_im[3];
436 
437                 a[0]            = a[0] + c_re[0];
438                 a[1]            = a[1] + c_re[1];
439                 a[2]            = a[2] + c_re[2];
440                 a[3]            = a[3] + c_re[3];
441 
442                 a[4]            = a[4] + c_im[0];
443                 a[5]            = a[5] + c_im[1];
444                 a[6]            = a[6] + c_im[2];
445                 a[7]            = a[7] + c_im[3];
446 
447                 // Update pointers
448                 a              += 8;
449                 b              += 8;
450 
451                 if ((k += 8) >= n)
452                     break;
453 
454                 // Rotate w vector
455                 c_re[0]         = w_re[0]*dw[0] - w_im[0]*dw[1];
456                 c_re[1]         = w_re[1]*dw[0] - w_im[1]*dw[1];
457                 c_re[2]         = w_re[2]*dw[0] - w_im[2]*dw[1];
458                 c_re[3]         = w_re[3]*dw[0] - w_im[3]*dw[1];
459 
460                 c_im[0]         = w_re[0]*dw[1] + w_im[0]*dw[0];
461                 c_im[1]         = w_re[1]*dw[1] + w_im[1]*dw[0];
462                 c_im[2]         = w_re[2]*dw[1] + w_im[2]*dw[0];
463                 c_im[3]         = w_re[3]*dw[1] + w_im[3]*dw[0];
464 
465                 w_re[0]         = c_re[0];
466                 w_re[1]         = c_re[1];
467                 w_re[2]         = c_re[2];
468                 w_re[3]         = c_re[3];
469 
470                 w_im[0]         = c_im[0];
471                 w_im[1]         = c_im[1];
472                 w_im[2]         = c_im[2];
473                 w_im[3]         = c_im[3];
474             }
475         }
476 
477         dw     += 2;
478         iw_re  += 4;
479         iw_im  += 4;
480         n     <<= 1;
481         bs    <<= 1;
482     }
483 
484     if (n < items)
485     {
486         // ONE LARGE CYCLE
487         // Set initial values of pointers
488         float *a            = tmp;
489         float *b            = &a[n];
490         float *d1           = dst;
491         float *d2           = &d1[n>>1];
492         float kn            = 1.0f / last;
493 
494         w_re[0]             = iw_re[0];
495         w_re[1]             = iw_re[1];
496         w_re[2]             = iw_re[2];
497         w_re[3]             = iw_re[3];
498         w_im[0]             = iw_im[0];
499         w_im[1]             = iw_im[1];
500         w_im[2]             = iw_im[2];
501         w_im[3]             = iw_im[3];
502 
503         for (size_t k=0; ;)
504         {
505             // Calculate complex c = w * b
506             c_re[0]         = w_re[0] * b[0] - w_im[0] * b[4];
507             c_re[1]         = w_re[1] * b[1] - w_im[1] * b[5];
508             c_re[2]         = w_re[2] * b[2] - w_im[2] * b[6];
509             c_re[3]         = w_re[3] * b[3] - w_im[3] * b[7];
510 
511             // Calculate the output values:
512             // a'   = a + c
513             // b'   = a - c
514             d1[0]           = kn*(a[0] + c_re[0]);
515             d1[1]           = kn*(a[1] + c_re[1]);
516             d1[2]           = kn*(a[2] + c_re[2]);
517             d1[3]           = kn*(a[3] + c_re[3]);
518 
519             d2[0]           = kn*(a[0] - c_re[0]);
520             d2[1]           = kn*(a[1] - c_re[1]);
521             d2[2]           = kn*(a[2] - c_re[2]);
522             d2[3]           = kn*(a[3] - c_re[3]);
523 
524             // Update pointers
525             a              += 8;
526             b              += 8;
527             d1             += 4;
528             d2             += 4;
529 
530             if ((k += 8) >= n)
531                 break;
532 
533             // Rotate w vector
534             c_re[0]         = w_re[0]*dw[0] - w_im[0]*dw[1];
535             c_re[1]         = w_re[1]*dw[0] - w_im[1]*dw[1];
536             c_re[2]         = w_re[2]*dw[0] - w_im[2]*dw[1];
537             c_re[3]         = w_re[3]*dw[0] - w_im[3]*dw[1];
538 
539             c_im[0]         = w_re[0]*dw[1] + w_im[0]*dw[0];
540             c_im[1]         = w_re[1]*dw[1] + w_im[1]*dw[0];
541             c_im[2]         = w_re[2]*dw[1] + w_im[2]*dw[0];
542             c_im[3]         = w_re[3]*dw[1] + w_im[3]*dw[0];
543 
544             w_re[0]         = c_re[0];
545             w_re[1]         = c_re[1];
546             w_re[2]         = c_re[2];
547             w_re[3]         = c_re[3];
548 
549             w_im[0]         = c_im[0];
550             w_im[1]         = c_im[1];
551             w_im[2]         = c_im[2];
552             w_im[3]         = c_im[3];
553         }
554     }
555     else
556     {
557         // Add real result to the target (ignore complex result)
558         float kn     = 1.0f / last;
559 
560         for (size_t i=0; i<items; i += 8)
561         {
562             dst[0]     += tmp[0] * kn;
563             dst[1]     += tmp[1] * kn;
564             dst[2]     += tmp[2] * kn;
565             dst[3]     += tmp[3] * kn;
566 
567             dst        += 4;
568             tmp        += 8;
569         }
570     }
571 }
572 
573 namespace native
574 {
575     void fastconv_parse(float *dst, const float *src, size_t rank);
576     void fastconv_restore(float *dst, float *tmp, size_t rank);
577     void fastconv_apply(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
578     void fastconv_parse_apply(float *dst, float *tmp, const float *c, const float *src, size_t rank);
579 }
580 
581 IF_ARCH_X86(
582     namespace sse
583     {
584         void fastconv_parse(float *dst, const float *src, size_t rank);
585         void fastconv_restore(float *dst, float *tmp, size_t rank);
586         void fastconv_apply(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
587         void fastconv_parse_apply(float *dst, float *tmp, const float *c, const float *src, size_t rank);
588     }
589 
590     namespace avx
591     {
592         void fastconv_parse(float *dst, const float *src, size_t rank);
593         void fastconv_restore(float *dst, float *tmp, size_t rank);
594         void fastconv_apply(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
595         void fastconv_parse_apply(float *dst, float *tmp, const float *c, const float *src, size_t rank);
596 
597         void fastconv_parse_fma3(float *dst, const float *src, size_t rank);
598         void fastconv_restore_fma3(float *dst, float *tmp, size_t rank);
599         void fastconv_apply_fma3(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
600         void fastconv_parse_apply_fma3(float *dst, float *tmp, const float *c, const float *src, size_t rank);
601     }
602 )
603 
604 IF_ARCH_ARM(
605     namespace neon_d32
606     {
607         void fastconv_parse(float *dst, const float *src, size_t rank);
608         void fastconv_restore(float *dst, float *tmp, size_t rank);
609         void fastconv_apply(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
610         void fastconv_parse_apply(float *dst, float *tmp, const float *c, const float *src, size_t rank);
611     }
612 )
613 
614 IF_ARCH_AARCH64(
615     namespace asimd
616     {
617         void fastconv_parse(float *dst, const float *src, size_t rank);
618         void fastconv_restore(float *dst, float *tmp, size_t rank);
619         void fastconv_apply(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
620         void fastconv_parse_apply(float *dst, float *tmp, const float *c, const float *src, size_t rank);
621     }
622 )
623 
624 typedef void (* fastconv_parse_t)(float *dst, const float *src, size_t rank);
625 typedef void (* fastconv_restore_t)(float *dst, float *tmp, size_t rank);
626 typedef void (* fastconv_apply_t)(float *dst, float *tmp, const float *c1, const float *c2, size_t rank);
627 typedef void (* fastconv_parse_apply_t)(float *dst, float *tmp, const float *c, const float *src, size_t rank);
628 
629 MTEST_BEGIN("dsp.fft", fastconv)
630 
test_fastconv_parse(const char * text,fastconv_parse_t parse,FloatBuffer & src)631     void test_fastconv_parse(const char *text, fastconv_parse_t parse, FloatBuffer &src)
632     {
633         printf("testing %s fastconv_parse...\n", text);
634         FloatBuffer samp(BUF_SIZE >> 1, 64);
635         FloatBuffer conv(BUF_SIZE << 1, 64);
636         samp.copy(src);
637 
638         samp.dump("samp");
639         parse(conv, samp, RANK);
640         conv.dump("conv");
641 
642         MTEST_ASSERT_MSG(samp.valid(), "samp corrupted");
643         MTEST_ASSERT_MSG(conv.valid(), "conv corrupted");
644     }
645 
test_fastconv_restore(const char * text,fastconv_parse_t parse,fastconv_restore_t restore,FloatBuffer & src,FloatBuffer & dst)646     void test_fastconv_restore(const char *text,
647             fastconv_parse_t parse, fastconv_restore_t restore,
648             FloatBuffer &src, FloatBuffer &dst)
649     {
650         printf("testing %s fastconv_parse + fastconv_restore...\n", text);
651         FloatBuffer samp(BUF_SIZE >> 1, 64);
652         FloatBuffer conv(BUF_SIZE << 1, 64);
653         FloatBuffer rest(BUF_SIZE, 64);
654 
655         samp.copy(src);
656         rest.copy(dst);
657         samp.dump("samp");
658         samp.dump("rest");
659 
660         parse(conv, samp, RANK);
661         conv.dump("conv");
662 
663         restore(rest, conv, RANK);
664         conv.dump("rtmp");
665         rest.dump("rest");
666 
667         MTEST_ASSERT_MSG(samp.valid(), "samp corrupted");
668         MTEST_ASSERT_MSG(conv.valid(), "conv corrupted");
669         MTEST_ASSERT_MSG(rest.valid(), "rest corrupted");
670     }
671 
test_fastconv_apply(const char * text,fastconv_parse_t parse,fastconv_apply_t apply,FloatBuffer & src1,FloatBuffer & src2,FloatBuffer & dst)672     void test_fastconv_apply(const char *text,
673             fastconv_parse_t parse, fastconv_apply_t apply,
674             FloatBuffer &src1, FloatBuffer &src2, FloatBuffer &dst)
675     {
676         printf("testing %s fastconv_parse + fastconv_apply...\n", text);
677         FloatBuffer samp1(BUF_SIZE >> 1, 64);
678         FloatBuffer samp2(BUF_SIZE >> 1, 64);
679         FloatBuffer conv1(BUF_SIZE << 1, 64);
680         FloatBuffer conv2(BUF_SIZE << 1, 64);
681         FloatBuffer rest(BUF_SIZE, 64);
682         FloatBuffer temp(BUF_SIZE << 1, 64);
683 
684         samp1.copy(src1);
685         samp2.copy(src2);
686         rest.copy(dst);
687         samp1.dump("samp1");
688         samp2.dump("samp2");
689         rest.dump ("dest ");
690 
691         parse(conv1, samp1, RANK);
692         parse(conv2, samp2, RANK);
693         conv1.dump("conv1");
694         conv2.dump("conv2");
695 
696         apply(rest, temp, conv1, conv2, RANK);
697         temp.dump ("temp ");
698         rest.dump ("rest ");
699 
700         MTEST_ASSERT_MSG(samp1.valid(), "samp1 corrupted");
701         MTEST_ASSERT_MSG(samp2.valid(), "samp2 corrupted");
702         MTEST_ASSERT_MSG(conv1.valid(), "conv1 corrupted");
703         MTEST_ASSERT_MSG(conv2.valid(), "conv2 corrupted");
704         MTEST_ASSERT_MSG(rest.valid(), "rest corrupted");
705         MTEST_ASSERT_MSG(temp.valid(), "temp corrupted");
706     }
707 
test_fastconv_parse_apply(const char * text,fastconv_parse_t parse,fastconv_parse_apply_t papply,FloatBuffer & src1,FloatBuffer & src2,FloatBuffer & dst)708     void test_fastconv_parse_apply(const char *text,
709             fastconv_parse_t parse, fastconv_parse_apply_t papply,
710             FloatBuffer &src1, FloatBuffer &src2, FloatBuffer &dst)
711     {
712         printf("testing %s fastconv_parse + fastconv_parse_apply...\n", text);
713         FloatBuffer samp1(BUF_SIZE >> 1, 64);
714         FloatBuffer samp2(BUF_SIZE >> 1, 64);
715         FloatBuffer conv (BUF_SIZE << 1, 64);
716         FloatBuffer rest(BUF_SIZE, 64);
717         FloatBuffer temp(BUF_SIZE << 1, 64);
718 
719         samp1.copy(src1);
720         samp2.copy(src2);
721         rest.copy(dst);
722         samp1.dump("samp1");
723         samp2.dump("samp2");
724         rest.dump ("dest ");
725 
726         parse(conv, samp1, RANK);
727         conv.dump("conv ");
728 
729         papply(rest, temp, conv, samp2, RANK);
730         temp.dump ("temp ");
731         rest.dump ("rest ");
732 
733         MTEST_ASSERT_MSG(samp1.valid(), "samp1 corrupted");
734         MTEST_ASSERT_MSG(samp2.valid(), "samp2 corrupted");
735         MTEST_ASSERT_MSG(conv.valid(), "conv corrupted");
736         MTEST_ASSERT_MSG(rest.valid(), "rest corrupted");
737         MTEST_ASSERT_MSG(temp.valid(), "temp corrupted");
738     }
739 
740     MTEST_MAIN
741     {
742         FloatBuffer src1(BUF_SIZE >> 1, 64);
743         FloatBuffer src2(BUF_SIZE >> 1, 64);
744         FloatBuffer dst(BUF_SIZE, 64);
745 
746         FloatBuffer samp1(BUF_SIZE >> 1, 64);
747         FloatBuffer samp2(BUF_SIZE >> 1, 64);
748         FloatBuffer conv1(BUF_SIZE << 1, 64);
749         FloatBuffer conv2(BUF_SIZE << 1, 64);
750         FloatBuffer rest1(BUF_SIZE, 64);
751         FloatBuffer rest2(BUF_SIZE, 64);
752         FloatBuffer temp(BUF_SIZE << 1, 64);
753 
754         // Prepare data
755         for (size_t i=0; i<(BUF_SIZE >> 1); ++i)
756         {
757             src1[i]         = i;
758             src2[i]         = 0.1 * i;
759         }
760         for (size_t i=0; i<BUF_SIZE; ++i)
761             dst[i]          = 10.0 * i;
762         samp2.copy(samp1);
763 
764         // Test parse
765         test_fastconv_parse("native", fastconv_parse, src1);
766         IF_ARCH_X86(
767             if (TEST_SUPPORTED(sse::fastconv_parse))
768                 test_fastconv_parse("SSE", sse::fastconv_parse, src1);
769             if (TEST_SUPPORTED(avx::fastconv_parse))
770                 test_fastconv_parse("AVX", avx::fastconv_parse, src1);
771             if (TEST_SUPPORTED(avx::fastconv_parse_fma3))
772                 test_fastconv_parse("FMA3", avx::fastconv_parse_fma3, src1);
773         );
774         IF_ARCH_ARM(
775             if (TEST_SUPPORTED(neon_d32::fastconv_parse))
776                 test_fastconv_parse("NEON-D32", neon_d32::fastconv_parse, src1);
777         );
778         IF_ARCH_AARCH64(
779             if (TEST_SUPPORTED(asimd::fastconv_parse))
780                 test_fastconv_parse("ASIMD", asimd::fastconv_parse, src1);
781         );
782 
783         // Test parse + restore
784         printf("\n");
785         test_fastconv_restore("native", fastconv_parse, fastconv_restore, src1, dst);
786 
787         IF_ARCH_X86(
788             if (TEST_SUPPORTED(sse::fastconv_parse) && TEST_SUPPORTED(sse::fastconv_restore))
789                 test_fastconv_restore("SSE", sse::fastconv_parse, sse::fastconv_restore, src1, dst);
790             if (TEST_SUPPORTED(avx::fastconv_parse) && TEST_SUPPORTED(avx::fastconv_restore))
791                 test_fastconv_restore("AVX", avx::fastconv_parse, avx::fastconv_restore, src1, dst);
792             if (TEST_SUPPORTED(avx::fastconv_parse_fma3) && TEST_SUPPORTED(avx::fastconv_restore_fma3))
793                 test_fastconv_restore("FMA3", avx::fastconv_parse_fma3, avx::fastconv_restore_fma3, src1, dst);
794         );
795         IF_ARCH_ARM(
796             if (TEST_SUPPORTED(neon_d32::fastconv_parse) && TEST_SUPPORTED(neon_d32::fastconv_restore))
797                 test_fastconv_restore("NEON-D32", neon_d32::fastconv_parse, neon_d32::fastconv_restore, src1, dst);
798         );
799         IF_ARCH_AARCH64(
800             if (TEST_SUPPORTED(asimd::fastconv_parse) && TEST_SUPPORTED(asimd::fastconv_restore))
801                 test_fastconv_restore("ASIMD", asimd::fastconv_parse, asimd::fastconv_restore, src1, dst);
802         );
803 
804         // Test parse + apply
805         printf("\n");
806         test_fastconv_apply("native", native::fastconv_parse, native::fastconv_apply, src1, src2, dst);
807 
808         IF_ARCH_X86(
809             if (TEST_SUPPORTED(sse::fastconv_parse) && TEST_SUPPORTED(sse::fastconv_apply))
810                 test_fastconv_apply("SSE", sse::fastconv_parse, sse::fastconv_apply, src1, src2, dst);
811             if (TEST_SUPPORTED(avx::fastconv_parse) && TEST_SUPPORTED(avx::fastconv_apply))
812                 test_fastconv_apply("AVX", avx::fastconv_parse, avx::fastconv_apply, src1, src2, dst);
813             if (TEST_SUPPORTED(avx::fastconv_parse_fma3) && TEST_SUPPORTED(avx::fastconv_apply_fma3))
814                 test_fastconv_apply("FMA3", avx::fastconv_parse_fma3, avx::fastconv_apply_fma3, src1, src2, dst);
815         );
816         IF_ARCH_ARM(
817             if (TEST_SUPPORTED(neon_d32::fastconv_parse) && TEST_SUPPORTED(neon_d32::fastconv_apply))
818                 test_fastconv_apply("NEON-D32", neon_d32::fastconv_parse, neon_d32::fastconv_apply, src1, src2, dst);
819         );
820         IF_ARCH_AARCH64(
821             if (TEST_SUPPORTED(asimd::fastconv_parse) && TEST_SUPPORTED(asimd::fastconv_apply))
822                 test_fastconv_apply("ASIMD", asimd::fastconv_parse, asimd::fastconv_apply, src1, src2, dst);
823         );
824 
825         // Test parse + parse_apply
826         printf("\n");
827         test_fastconv_parse_apply("native", native::fastconv_parse, native::fastconv_parse_apply, src1, src2, dst);
828 
829         IF_ARCH_X86(
830             if (TEST_SUPPORTED(sse::fastconv_parse) && TEST_SUPPORTED(sse::fastconv_parse_apply))
831                 test_fastconv_parse_apply("SSE", sse::fastconv_parse, sse::fastconv_parse_apply, src1, src2, dst);
832             if (TEST_SUPPORTED(avx::fastconv_parse) && TEST_SUPPORTED(avx::fastconv_parse_apply))
833                 test_fastconv_parse_apply("AVX", avx::fastconv_parse, avx::fastconv_parse_apply, src1, src2, dst);
834             if (TEST_SUPPORTED(avx::fastconv_parse_fma3) && TEST_SUPPORTED(avx::fastconv_parse_apply_fma3))
835                 test_fastconv_parse_apply("FMA3", avx::fastconv_parse_fma3, avx::fastconv_parse_apply_fma3, src1, src2, dst);
836         );
837 
838         IF_ARCH_ARM(
839             if (TEST_SUPPORTED(neon_d32::fastconv_parse) && TEST_SUPPORTED(neon_d32::fastconv_parse_apply))
840                 test_fastconv_parse_apply("NEON-D32", neon_d32::fastconv_parse, neon_d32::fastconv_parse_apply, src1, src2, dst);
841         );
842 
843         IF_ARCH_AARCH64(
844             if (TEST_SUPPORTED(asimd::fastconv_parse) && TEST_SUPPORTED(asimd::fastconv_parse_apply))
845                 test_fastconv_parse_apply("ASIMD", asimd::fastconv_parse, asimd::fastconv_parse_apply, src1, src2, dst);
846         );
847     }
848 MTEST_END
849 
850