/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 23 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX2_FFT_NORMALIZE_H_
#define DSP_ARCH_X86_AVX2_FFT_NORMALIZE_H_

#ifndef DSP_ARCH_X86_AVX2_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX2_IMPL */

namespace avx2
{
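    // Reference semantics (scalar sketch; normalize_fft3_ref is a hypothetical
    // helper for documentation/testing, not part of the library API): scale
    // each of the 2^rank real and imaginary spectrum samples by k = 1/2^rank,
    // writing the result to a separate destination. The AVX2 routine below
    // computes exactly this, 16 samples at a time.
    static inline void normalize_fft3_ref(float *dre, float *dim, const float *re, const float *im, size_t rank)
    {
        float k         = 1.0f / float(size_t(1) << rank);  // normalization coefficient
        size_t count    = size_t(1) << rank;                // number of spectrum samples
        for (size_t i = 0; i < count; ++i)
        {
            dre[i]          = re[i] * k;
            dim[i]          = im[i] * k;
        }
    }
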
    void normalize_fft3(float *dre, float *dim, const float *re, const float *im, size_t rank)
    {
        IF_ARCH_X86(
            float k = 1.0f/(1 << rank);             // normalization coefficient k = 1/2^rank
            size_t count = 1 << rank, off = 0;      // number of samples, byte offset into the buffers
        );
        ARCH_X86_ASM(
            // x16 blocks
            __ASM_EMIT  ("vbroadcastss  %%xmm0, %%ymm0")                 // ymm0   = k
            __ASM_EMIT32("subl          $16, %[count]")
            __ASM_EMIT64("sub           $16, %[count]")
            __ASM_EMIT  ("vmovaps       %%ymm0, %%ymm1")                 // ymm1   = k
            __ASM_EMIT  ("jb            2f")
            __ASM_EMIT  ("1:")
            __ASM_EMIT  ("vmulps        0x00(%[s_re], %[off]), %%ymm0, %%ymm4")
            __ASM_EMIT  ("vmulps        0x20(%[s_re], %[off]), %%ymm1, %%ymm5")
            __ASM_EMIT  ("vmulps        0x00(%[s_im], %[off]), %%ymm0, %%ymm6")
            __ASM_EMIT  ("vmulps        0x20(%[s_im], %[off]), %%ymm1, %%ymm7")
            __ASM_EMIT  ("vmovups       %%ymm4, 0x00(%[d_re], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm5, 0x20(%[d_re], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm7, 0x20(%[d_im], %[off])")
            __ASM_EMIT  ("add           $0x40, %[off]")
            __ASM_EMIT32("subl          $16, %[count]")
            __ASM_EMIT64("sub           $16, %[count]")
            __ASM_EMIT  ("jae           1b")
            __ASM_EMIT  ("2:")
            // x8 block: count is biased by -16 after the loop above, so adding
            // 8 back yields a non-negative value iff at least 8 samples remain
            __ASM_EMIT32("addl          $8, %[count]")
            __ASM_EMIT64("add           $8, %[count]")
            __ASM_EMIT  ("jl            4f")
            __ASM_EMIT  ("vmulps        0x00(%[s_re], %[off]), %%ymm0, %%ymm4")
            __ASM_EMIT  ("vmulps        0x00(%[s_im], %[off]), %%ymm1, %%ymm6")
            __ASM_EMIT  ("vmovups       %%ymm4, 0x00(%[d_re], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT  ("4:")
            : [off] "+r" (off), [count] __ASM_ARG_RW(count),
              [k] "+Yz" (k)
            : [s_re] "r" (re), [s_im] "r" (im),
              [d_re] "r" (dre), [d_im] "r" (dim)
            : "cc", "memory",
              "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

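    // Reference semantics (scalar sketch; normalize_fft2_ref is a hypothetical
    // helper for documentation/testing, not part of the library API): the
    // in-place variant, scaling the 2^rank real and imaginary spectrum samples
    // by k = 1/2^rank directly in the re/im buffers.
    static inline void normalize_fft2_ref(float *re, float *im, size_t rank)
    {
        float k         = 1.0f / float(size_t(1) << rank);  // normalization coefficient
        size_t count    = size_t(1) << rank;                // number of spectrum samples
        for (size_t i = 0; i < count; ++i)
        {
            re[i]          *= k;
            im[i]          *= k;
        }
    }
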
    void normalize_fft2(float *re, float *im, size_t rank)
    {
        IF_ARCH_X86(
            float k = 1.0f/(1 << rank);             // normalization coefficient k = 1/2^rank
            size_t count = 1 << rank, off = 0;      // number of samples, byte offset into the buffers
        );
        ARCH_X86_ASM(
            // x16 blocks
            __ASM_EMIT  ("vbroadcastss  %%xmm0, %%ymm0")                 // ymm0   = k
            __ASM_EMIT32("subl          $16, %[count]")
            __ASM_EMIT64("sub           $16, %[count]")
            __ASM_EMIT  ("vmovaps       %%ymm0, %%ymm1")                 // ymm1   = k
            __ASM_EMIT  ("jb            2f")
            __ASM_EMIT  ("1:")
            __ASM_EMIT  ("vmulps        0x00(%[d_re], %[off]), %%ymm0, %%ymm4")
            __ASM_EMIT  ("vmulps        0x20(%[d_re], %[off]), %%ymm1, %%ymm5")
            __ASM_EMIT  ("vmulps        0x00(%[d_im], %[off]), %%ymm0, %%ymm6")
            __ASM_EMIT  ("vmulps        0x20(%[d_im], %[off]), %%ymm1, %%ymm7")
            __ASM_EMIT  ("vmovups       %%ymm4, 0x00(%[d_re], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm5, 0x20(%[d_re], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm7, 0x20(%[d_im], %[off])")
            __ASM_EMIT  ("add           $0x40, %[off]")
            __ASM_EMIT32("subl          $16, %[count]")
            __ASM_EMIT64("sub           $16, %[count]")
            __ASM_EMIT  ("jae           1b")
            __ASM_EMIT  ("2:")
            // x8 block: count is biased by -16 after the loop above, so adding
            // 8 back yields a non-negative value iff at least 8 samples remain
            __ASM_EMIT32("addl          $8, %[count]")
            __ASM_EMIT64("add           $8, %[count]")
            __ASM_EMIT  ("jl            4f")
            __ASM_EMIT  ("vmulps        0x00(%[d_re], %[off]), %%ymm0, %%ymm4")
            __ASM_EMIT  ("vmulps        0x00(%[d_im], %[off]), %%ymm1, %%ymm6")
            __ASM_EMIT  ("vmovups       %%ymm4, 0x00(%[d_re], %[off])")
            __ASM_EMIT  ("vmovups       %%ymm6, 0x00(%[d_im], %[off])")
            __ASM_EMIT  ("4:")
            : [off] "+r" (off), [count] "+r" (count),
              [k] "+Yz" (k)
            : [d_re] "r" (re), [d_im] "r" (im)
            : "cc", "memory",
              "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
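
    // Usage sketch (illustrative values): after a forward FFT of rank 10
    // (1 << 10 = 1024 points), normalize_fft2(re, im, 10) scales both buffers
    // in place by k = 1/1024, while normalize_fft3(dre, dim, re, im, 10)
    // writes the scaled spectrum to separate buffers.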
}


#endif /* DSP_ARCH_X86_AVX2_FFT_NORMALIZE_H_ */