/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 13 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX_IMPL */

#define __SKIP(x)

// c    = a - b
// a'   = a + b
// b'   = c * w
#define FASTCONV_DIRECT_BUTTERFLY_BODY8(add_re, add_im, FMA_SEL) \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT("vmovaps         0x00(%[ak]), %%ymm6")               /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps         0x20(%[ak]), %%ymm7")               /* ymm7 = x_im */ \
        /* Start loop */ \
        __ASM_EMIT("1:") \
            __ASM_EMIT("vmovups         0x00(%[dst], %[off1]), %%ymm0")     /* ymm0 = a_re */ \
            __ASM_EMIT("vmovups         0x20(%[dst], %[off1]), %%ymm1")     /* ymm1 = a_im */ \
            __ASM_EMIT("vmovups         0x00(%[dst], %[off2]), %%ymm2")     /* ymm2 = b_re */ \
            __ASM_EMIT("vmovups         0x20(%[dst], %[off2]), %%ymm3")     /* ymm3 = b_im */ \
            /* Perform butterfly */ \
            __ASM_EMIT("vsubps          %%ymm2, %%ymm0, %%ymm4")                /* ymm4 = c_re  = a_re - b_re */ \
            __ASM_EMIT("vsubps          %%ymm3, %%ymm1, %%ymm5")                /* ymm5 = c_im  = a_im - b_im */ \
            __ASM_EMIT("vaddps          %%ymm2, %%ymm0, %%ymm0")                /* ymm0 = a_re' = a_re + b_re */ \
            __ASM_EMIT("vaddps          %%ymm3, %%ymm1, %%ymm1")                /* ymm1 = a_im' = a_im + b_im */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm4, %%ymm2")                /* ymm2 = x_im * c_re */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm5, %%ymm3")                /* ymm3 = x_im * c_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm4, %%ymm4", ""))           /* ymm4 = x_re * c_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm5, %%ymm5", ""))           /* ymm5 = x_re * c_im */ \
            __ASM_EMIT(FMA_SEL(add_re "  %%ymm3, %%ymm4, %%ymm4", add_re " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = b_re = x_re * c_re +- x_im * c_im */ \
            __ASM_EMIT(FMA_SEL(add_im "  %%ymm2, %%ymm5, %%ymm5", add_im " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = b_im = x_re * c_im -+ x_im * c_re */ \
            /* Store values */ \
            __ASM_EMIT("vmovups         %%ymm0, 0x00(%[dst], %[off1])") \
            __ASM_EMIT("vmovups         %%ymm1, 0x20(%[dst], %[off1])") \
            __ASM_EMIT("vmovups         %%ymm4, 0x00(%[dst], %[off2])") \
            __ASM_EMIT("vmovups         %%ymm5, 0x20(%[dst], %[off2])") \
            __ASM_EMIT("add             $0x40, %[off1]") \
            __ASM_EMIT("add             $0x40, %[off2]") \
            __ASM_EMIT32("subl          $8, %[np]") \
            __ASM_EMIT64("subq          $8, %[np]") \
            __ASM_EMIT("jz              2f") \
            /* Rotate angle */ \
            __ASM_EMIT("vmovaps         0x00(%[wk]), %%ymm4")               /* ymm4 = w_re */ \
            __ASM_EMIT("vmovaps         0x20(%[wk]), %%ymm5")               /* ymm5 = w_im */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm6, %%ymm2")            /* ymm2 = w_im * x_re */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm7, %%ymm3")            /* ymm3 = w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm6, %%ymm6", ""))       /* ymm6 = w_re * x_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm7, %%ymm7", ""))       /* ymm7 = w_re * x_im */ \
            __ASM_EMIT(FMA_SEL("vsubps  %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vaddps  %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
            /* Repeat loop */ \
        __ASM_EMIT("jmp             1b") \
        __ASM_EMIT("2:") \
        \
        : [off1] "+r" (off1), [off2] "+r" (off2), [np] __ASM_ARG_RW(np) \
        : [dst] "r" (dst), [ak] "r" (ak), [wk] "r" (wk) \
        : "cc", "memory",  \
        "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
        "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );
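
/*
 * For reference: a scalar sketch of what one FASTCONV_DIRECT_BUTTERFLY_BODY8
 * pass computes in its direct, non-FMA instantiation (add_re = vaddps,
 * add_im = vsubps). The function name and the use of float indices instead of
 * byte offsets are illustrative assumptions only; data is laid out as blocks
 * of 8 real values followed by 8 imaginary values, and ak/wk hold 8 packed
 * twiddle values in the same split form.
 *
 *  static void direct_butterfly_ref(float *dst, const float *ak, const float *wk,
 *                                   size_t off1, size_t off2, size_t np)
 *  {
 *      float x_re[8], x_im[8];                                 // current angle
 *      for (size_t k=0; k<8; ++k) { x_re[k] = ak[k]; x_im[k] = ak[k+8]; }
 *      for (;;)
 *      {
 *          float *a = &dst[off1], *b = &dst[off2];
 *          for (size_t k=0; k<8; ++k)
 *          {
 *              float c_re  = a[k]   - b[k];                    // c  = a - b
 *              float c_im  = a[k+8] - b[k+8];
 *              a[k]       += b[k];                             // a' = a + b
 *              a[k+8]     += b[k+8];
 *              b[k]        = x_re[k]*c_re + x_im[k]*c_im;      // b' = c * conj(x)
 *              b[k+8]      = x_re[k]*c_im - x_im[k]*c_re;
 *          }
 *          off1 += 16; off2 += 16;
 *          if ((np -= 8) == 0)
 *              break;
 *          for (size_t k=0; k<8; ++k)                          // rotate angle: x *= w
 *          {
 *              float t  = wk[k]*x_re[k] - wk[k+8]*x_im[k];
 *              x_im[k]  = wk[k]*x_im[k] + wk[k+8]*x_re[k];
 *              x_re[k]  = t;
 *          }
 *      }
 *  }
 */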


#define FASTCONV_DIRECT_BUTTERFLY_LAST(add_re, add_im, FMA_SEL) \
    ARCH_X86_ASM( \
        /* Loop 2x 4-element butterflies */ \
        __ASM_EMIT("vmovaps         0x00 + %[FFT_A], %%ymm6")       /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps         0x20 + %[FFT_A], %%ymm7")       /* ymm7 = x_im */ \
        __ASM_EMIT("sub             $2, %[nb]") \
        __ASM_EMIT("jb              2f") \
        __ASM_EMIT("1:") \
            /* Load data to registers */ \
            __ASM_EMIT("vmovups         0x00(%[dst]), %%xmm0")                  /* xmm0 = r0  r1  r2  r3 */ \
            __ASM_EMIT("vmovups         0x10(%[dst]), %%xmm2")                  /* xmm2 = r4  r5  r6  r7 */ \
            __ASM_EMIT("vmovups         0x20(%[dst]), %%xmm1")                  /* xmm1 = i0  i1  i2  i3 */ \
            __ASM_EMIT("vmovups         0x30(%[dst]), %%xmm3")                  /* xmm3 = i4  i5  i6  i7 */ \
            __ASM_EMIT("vinsertf128     $1, 0x40(%[dst]), %%ymm0, %%ymm0")      /* ymm0 = a_re = r0  r1  r2  r3  r8  r9  r10 r11 */ \
            __ASM_EMIT("vinsertf128     $1, 0x50(%[dst]), %%ymm2, %%ymm2")      /* ymm2 = b_re = r4  r5  r6  r7  r12 r13 r14 r15 */ \
            __ASM_EMIT("vinsertf128     $1, 0x60(%[dst]), %%ymm1, %%ymm1")      /* ymm1 = a_im = i0  i1  i2  i3  i8  i9  i10 i11 */ \
            __ASM_EMIT("vinsertf128     $1, 0x70(%[dst]), %%ymm3, %%ymm3")      /* ymm3 = b_im = i4  i5  i6  i7  i12 i13 i14 i15 */ \
            /* Perform 3rd-order butterflies */ \
            __ASM_EMIT("vsubps          %%ymm2, %%ymm0, %%ymm4")                /* ymm4 = c_re  = a_re - b_re */ \
            __ASM_EMIT("vsubps          %%ymm3, %%ymm1, %%ymm5")                /* ymm5 = c_im  = a_im - b_im */ \
            __ASM_EMIT("vaddps          %%ymm2, %%ymm0, %%ymm0")                /* ymm0 = a_re' = a_re + b_re */ \
            __ASM_EMIT("vaddps          %%ymm3, %%ymm1, %%ymm1")                /* ymm1 = a_im' = a_im + b_im */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm4, %%ymm2")                /* ymm2 = x_im * c_re */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm5, %%ymm3")                /* ymm3 = x_im * c_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm4, %%ymm4", ""))           /* ymm4 = x_re * c_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm5, %%ymm5", ""))           /* ymm5 = x_re * c_im */ \
            __ASM_EMIT(FMA_SEL(add_re "  %%ymm3, %%ymm4, %%ymm4", add_re " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = b_re = x_re * c_re +- x_im * c_im */ \
            __ASM_EMIT(FMA_SEL(add_im "  %%ymm2, %%ymm5, %%ymm5", add_im " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = b_im = x_re * c_im -+ x_im * c_re */ \
            /* 2nd-order butterflies */ \
            /* s0" = (r0 + r2) + j*(i0 + i2) + (r1 + r3) + j*(i1 + i3) */ \
            /* s1" = (r0 + r2) + j*(i0 + i2) - (r1 + r3) - j*(i1 + i3) */ \
            /* s2" = (r0 - r2) + j*(i0 - i2) + (i1 - i3) - j*(r1 - r3) */ \
            /* s3" = (r0 - r2) + j*(i0 - i2) - (i1 - i3) + j*(r1 - r3) */ \
            /* ymm0         = r0  r1  r2  r3 ... */ \
            /* ymm1         = i0  i1  i2  i3 ... */ \
            /* ymm4         = r4  r5  r6  r7 ... */ \
            /* ymm5         = i4  i5  i6  i7 ... */ \
            __ASM_EMIT("vshufps         $0xd8, %%ymm0, %%ymm0, %%ymm0")         /* ymm0 = r0 r2 r1 r3 */ \
            __ASM_EMIT("vshufps         $0xd8, %%ymm1, %%ymm1, %%ymm1")         /* ymm1 = i0 i2 i1 i3 */ \
            __ASM_EMIT("vshufps         $0xd8, %%ymm4, %%ymm4, %%ymm4") \
            __ASM_EMIT("vshufps         $0xd8, %%ymm5, %%ymm5, %%ymm5") \
            __ASM_EMIT("vhsubps         %%ymm1, %%ymm0, %%ymm2")                /* ymm2 = r0-r2 r1-r3 i0-i2 i1-i3 = r1' r3' i1' i3' */ \
            __ASM_EMIT("vhsubps         %%ymm5, %%ymm4, %%ymm3") \
            __ASM_EMIT("vhaddps         %%ymm1, %%ymm0, %%ymm0")                /* ymm0 = r0+r2 r1+r3 i0+i2 i1+i3 = r0' r2' i0' i2' */ \
            __ASM_EMIT("vhaddps         %%ymm5, %%ymm4, %%ymm4") \
            /* 1st-order 8x butterfly */ \
            __ASM_EMIT("vshufps         $0x6e, %%ymm2, %%ymm0, %%ymm1")         /* ymm1 = i0' i2' i1' r3' */ \
            __ASM_EMIT("vshufps         $0x6e, %%ymm3, %%ymm4, %%ymm5") \
            __ASM_EMIT("vshufps         $0xc4, %%ymm2, %%ymm0, %%ymm0")         /* ymm0 = r0' r2' r1' i3' */ \
            __ASM_EMIT("vshufps         $0xc4, %%ymm3, %%ymm4, %%ymm4") \
            __ASM_EMIT("vhsubps         %%ymm1, %%ymm0, %%ymm2")                /* ymm2 = r0'-r2' r1'-i3' i0'-i2' i1'-r3' = r1" r3" i1" i2" */ \
            __ASM_EMIT("vhsubps         %%ymm5, %%ymm4, %%ymm3") \
            __ASM_EMIT("vhaddps         %%ymm1, %%ymm0, %%ymm0")                /* ymm0 = r0'+r2' r1'+i3' i0'+i2' i1'+r3' = r0" r2" i0" i3" */ \
            __ASM_EMIT("vhaddps         %%ymm5, %%ymm4, %%ymm4") \
            __ASM_EMIT("vblendps        $0x88, %%ymm0, %%ymm2, %%ymm1")         /* ymm1 = r1" r3" i1" i3" */ \
            __ASM_EMIT("vblendps        $0x88, %%ymm4, %%ymm3, %%ymm5") \
            __ASM_EMIT("vblendps        $0x88, %%ymm2, %%ymm0, %%ymm0")         /* ymm0 = r0" r2" i0" i2" */ \
            __ASM_EMIT("vblendps        $0x88, %%ymm3, %%ymm4, %%ymm4") \
            __ASM_EMIT("vunpckhps       %%ymm1, %%ymm0, %%ymm3")                /* ymm3 = i0" i1" i2" i3" */ \
            __ASM_EMIT("vunpcklps       %%ymm1, %%ymm0, %%ymm2")                /* ymm2 = r0" r1" r2" r3" */ \
            __ASM_EMIT("vunpckhps       %%ymm5, %%ymm4, %%ymm1") \
            __ASM_EMIT("vunpcklps       %%ymm5, %%ymm4, %%ymm0") \
            /* Store */ \
            __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm3, 0x20(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm1, 0x30(%[dst])") \
            __ASM_EMIT("vextractf128    $1, %%ymm2, 0x40(%[dst])") \
            __ASM_EMIT("vextractf128    $1, %%ymm0, 0x50(%[dst])") \
            __ASM_EMIT("vextractf128    $1, %%ymm3, 0x60(%[dst])") \
            __ASM_EMIT("vextractf128    $1, %%ymm1, 0x70(%[dst])") \
            /* Move pointers and repeat */ \
            __ASM_EMIT("add             $0x80, %[dst]") \
            __ASM_EMIT("sub             $2, %[nb]") \
            __ASM_EMIT("jae             1b") \
        __ASM_EMIT("2:") \
        /* 1x 4-element butterflies */ \
        __ASM_EMIT("add             $1, %[nb]") \
        __ASM_EMIT("jl              4f") \
            __ASM_EMIT("vmovups         0x00(%[dst]), %%xmm0")                  /* xmm0 = r0  r1  r2  r3 */ \
            __ASM_EMIT("vmovups         0x10(%[dst]), %%xmm2")                  /* xmm2 = r4  r5  r6  r7 */ \
            __ASM_EMIT("vmovups         0x20(%[dst]), %%xmm1")                  /* xmm1 = i0  i1  i2  i3 */ \
            __ASM_EMIT("vmovups         0x30(%[dst]), %%xmm3")                  /* xmm3 = i4  i5  i6  i7 */ \
            /* Perform 3rd-order 8x butterfly */ \
            __ASM_EMIT("vsubps          %%xmm2, %%xmm0, %%xmm4")                /* xmm4 = c_re  = a_re - b_re */ \
            __ASM_EMIT("vsubps          %%xmm3, %%xmm1, %%xmm5")                /* xmm5 = c_im  = a_im - b_im */ \
            __ASM_EMIT("vaddps          %%xmm2, %%xmm0, %%xmm0")                /* xmm0 = a_re' = a_re + b_re */ \
            __ASM_EMIT("vaddps          %%xmm3, %%xmm1, %%xmm1")                /* xmm1 = a_im' = a_im + b_im */ \
            __ASM_EMIT("vmulps          %%xmm7, %%xmm4, %%xmm2")                /* xmm2 = x_im * c_re */ \
            __ASM_EMIT("vmulps          %%xmm7, %%xmm5, %%xmm3")                /* xmm3 = x_im * c_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%xmm6, %%xmm4, %%xmm4", ""))           /* xmm4 = x_re * c_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%xmm6, %%xmm5, %%xmm5", ""))           /* xmm5 = x_re * c_im */ \
            __ASM_EMIT(FMA_SEL(add_re "  %%xmm3, %%xmm4, %%xmm4", add_re " %%xmm6, %%xmm3, %%xmm4")) /* xmm4 = b_re = x_re * c_re +- x_im * c_im */ \
            __ASM_EMIT(FMA_SEL(add_im "  %%xmm2, %%xmm5, %%xmm5", add_im " %%xmm6, %%xmm2, %%xmm5")) /* xmm5 = b_im = x_re * c_im -+ x_im * c_re */ \
            /* 2nd-order butterflies */ \
            /* s0" = (r0 + r2) + j*(i0 + i2) + (r1 + r3) + j*(i1 + i3) */ \
            /* s1" = (r0 + r2) + j*(i0 + i2) - (r1 + r3) - j*(i1 + i3) */ \
            /* s2" = (r0 - r2) + j*(i0 - i2) + (i1 - i3) - j*(r1 - r3) */ \
            /* s3" = (r0 - r2) + j*(i0 - i2) - (i1 - i3) + j*(r1 - r3) */ \
            /* xmm0         = r0  r1  r2  r3 ... */ \
            /* xmm1         = i0  i1  i2  i3 ... */ \
            /* xmm4         = r4  r5  r6  r7 ... */ \
            /* xmm5         = i4  i5  i6  i7 ... */ \
            __ASM_EMIT("vshufps         $0xd8, %%xmm0, %%xmm0, %%xmm0")         /* xmm0 = r0 r2 r1 r3 */ \
            __ASM_EMIT("vshufps         $0xd8, %%xmm1, %%xmm1, %%xmm1")         /* xmm1 = i0 i2 i1 i3 */ \
            __ASM_EMIT("vshufps         $0xd8, %%xmm4, %%xmm4, %%xmm4") \
            __ASM_EMIT("vshufps         $0xd8, %%xmm5, %%xmm5, %%xmm5") \
            __ASM_EMIT("vhsubps         %%xmm1, %%xmm0, %%xmm2")                /* xmm2 = r0-r2 r1-r3 i0-i2 i1-i3 = r1' r3' i1' i3' */ \
            __ASM_EMIT("vhsubps         %%xmm5, %%xmm4, %%xmm3") \
            __ASM_EMIT("vhaddps         %%xmm1, %%xmm0, %%xmm0")                /* xmm0 = r0+r2 r1+r3 i0+i2 i1+i3 = r0' r2' i0' i2' */ \
            __ASM_EMIT("vhaddps         %%xmm5, %%xmm4, %%xmm4") \
            /* 1st-order butterflies */ \
            __ASM_EMIT("vshufps         $0x6e, %%xmm2, %%xmm0, %%xmm1")         /* xmm1 = i0' i2' i1' r3' */ \
            __ASM_EMIT("vshufps         $0x6e, %%xmm3, %%xmm4, %%xmm5") \
            __ASM_EMIT("vshufps         $0xc4, %%xmm2, %%xmm0, %%xmm0")         /* xmm0 = r0' r2' r1' i3' */ \
            __ASM_EMIT("vshufps         $0xc4, %%xmm3, %%xmm4, %%xmm4") \
            __ASM_EMIT("vmovups         %%xmm0, 0x00(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm4, 0x20(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm5, 0x30(%[dst])") \
            __ASM_EMIT("vhsubps         %%xmm1, %%xmm0, %%xmm2")                /* xmm2 = r0'-r2' r1'-i3' i0'-i2' i1'-r3' = r1" r3" i1" i2" */ \
            __ASM_EMIT("vhsubps         %%xmm5, %%xmm4, %%xmm3") \
            __ASM_EMIT("vhaddps         %%xmm1, %%xmm0, %%xmm0")                /* xmm0 = r0'+r2' r1'+i3' i0'+i2' i1'+r3' = r0" r2" i0" i3" */ \
            __ASM_EMIT("vhaddps         %%xmm5, %%xmm4, %%xmm4") \
            __ASM_EMIT("vblendps        $0x88, %%xmm0, %%xmm2, %%xmm1")         /* xmm1 = r1" r3" i1" i3" */ \
            __ASM_EMIT("vblendps        $0x88, %%xmm4, %%xmm3, %%xmm5") \
            __ASM_EMIT("vblendps        $0x88, %%xmm2, %%xmm0, %%xmm0")         /* xmm0 = r0" r2" i0" i2" */ \
            __ASM_EMIT("vblendps        $0x88, %%xmm3, %%xmm4, %%xmm4") \
            __ASM_EMIT("vunpckhps       %%xmm1, %%xmm0, %%xmm3")                /* xmm3 = i0" i1" i2" i3" */ \
            __ASM_EMIT("vunpcklps       %%xmm1, %%xmm0, %%xmm2")                /* xmm2 = r0" r1" r2" r3" */ \
            __ASM_EMIT("vunpckhps       %%xmm5, %%xmm4, %%xmm1") \
            __ASM_EMIT("vunpcklps       %%xmm5, %%xmm4, %%xmm0") \
            /* Store */ \
            __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm3, 0x20(%[dst])") \
            __ASM_EMIT("vmovups         %%xmm1, 0x30(%[dst])") \
        __ASM_EMIT("4:") \
        \
        : [dst] "+r" (dst), [nb] "+r" (nb) \
        : [FFT_A] "o" (FFT_A) \
        : "cc", "memory", \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );
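
/*
 * The shuffle/hadd sequence above implements, per group of four complex values
 * s0..s3 (s_k = r_k + j*i_k after the preceding 8-element butterfly), exactly
 * the s0"..s3" combination documented in the comments. A scalar C99 sketch of
 * that 4-point step (dft4_ref is a hypothetical helper, not part of the API):
 *
 *  #include <complex.h>
 *
 *  static void dft4_ref(float complex s[4])
 *  {
 *      float complex t0 = s[0] + s[2], t1 = s[0] - s[2];
 *      float complex t2 = s[1] + s[3], t3 = s[1] - s[3];
 *      s[0] = t0 + t2;         // s0"
 *      s[1] = t0 - t2;         // s1"
 *      s[2] = t1 - I*t3;       // s2" = (r1'+i3') + j*(i1'-r3')
 *      s[3] = t1 + I*t3;       // s3" = (r1'-i3') + j*(i1'+r3')
 *  }
 */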

#define FASTCONV_REVERSE_BUTTERFLY_BODY8(add_re, add_im, FMA_SEL) \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT("vmovaps         0x00(%[ak]), %%ymm6")               /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps         0x20(%[ak]), %%ymm7")               /* ymm7 = x_im */ \
        /* Start loop */ \
        __ASM_EMIT("1:") \
            __ASM_EMIT("vmovups         0x00(%[dst], %[off1]), %%ymm0")     /* ymm0 = a_re */ \
            __ASM_EMIT("vmovups         0x20(%[dst], %[off1]), %%ymm1")     /* ymm1 = a_im */ \
            __ASM_EMIT("vmovups         0x00(%[dst], %[off2]), %%ymm2")     /* ymm2 = b_re */ \
            __ASM_EMIT("vmovups         0x20(%[dst], %[off2]), %%ymm3")     /* ymm3 = b_im */ \
            /* Calculate complex multiplication */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm2, %%ymm4")            /* ymm4 = x_im * b_re */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm3, %%ymm5")            /* ymm5 = x_im * b_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm2, %%ymm2", ""))       /* ymm2 = x_re * b_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm3, %%ymm3", ""))       /* ymm3 = x_re * b_im */ \
            __ASM_EMIT(FMA_SEL(add_re "  %%ymm5, %%ymm2, %%ymm5", add_re " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = c_re = x_re * b_re +- x_im * b_im */ \
            __ASM_EMIT(FMA_SEL(add_im "  %%ymm4, %%ymm3, %%ymm4", add_im " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = c_im = x_re * b_im -+ x_im * b_re */ \
            /* Perform butterfly */ \
            __ASM_EMIT("vsubps          %%ymm5, %%ymm0, %%ymm2")            /* ymm2 = a_re - c_re */ \
            __ASM_EMIT("vsubps          %%ymm4, %%ymm1, %%ymm3")            /* ymm3 = a_im - c_im */ \
            __ASM_EMIT("vaddps          %%ymm5, %%ymm0, %%ymm0")            /* ymm0 = a_re + c_re */ \
            __ASM_EMIT("vaddps          %%ymm4, %%ymm1, %%ymm1")            /* ymm1 = a_im + c_im */ \
            /* Store values */ \
            __ASM_EMIT("vmovups         %%ymm0, 0x00(%[dst], %[off1])") \
            __ASM_EMIT("vmovups         %%ymm1, 0x20(%[dst], %[off1])") \
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[dst], %[off2])") \
            __ASM_EMIT("vmovups         %%ymm3, 0x20(%[dst], %[off2])") \
            __ASM_EMIT("add             $0x40, %[off1]") \
            __ASM_EMIT("add             $0x40, %[off2]") \
            __ASM_EMIT32("subl          $8, %[np]") \
            __ASM_EMIT64("subq          $8, %[np]") \
            __ASM_EMIT("jz              2f") \
            /* Rotate angle */ \
            __ASM_EMIT("vmovaps         0x00(%[wk]), %%ymm4")               /* ymm4 = w_re */ \
            __ASM_EMIT("vmovaps         0x20(%[wk]), %%ymm5")               /* ymm5 = w_im */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm6, %%ymm2")            /* ymm2 = w_im * x_re */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm7, %%ymm3")            /* ymm3 = w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm6, %%ymm6", ""))       /* ymm6 = w_re * x_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm7, %%ymm7", ""))       /* ymm7 = w_re * x_im */ \
            __ASM_EMIT(FMA_SEL("vsubps  %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vaddps  %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
            /* Repeat loop */ \
        __ASM_EMIT("jmp             1b") \
        __ASM_EMIT("2:") \
        \
        : [off1] "+r" (off1), [off2] "+r" (off2), [np] __ASM_ARG_RW(np) \
        : [dst] "r" (dst), [ak] "r" (ak), [wk] "r" (wk) \
        : "cc", "memory",  \
        "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
        "%xmm4", "%xmm5", "%xmm6", "%xmm7"  \
    );
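
/*
 * A scalar view of one FASTCONV_REVERSE_BUTTERFLY_BODY8 step in its non-FMA
 * instantiation (add_re = vsubps, add_im = vaddps); an illustrative sketch
 * only. Unlike the direct variant, the twiddle is applied to b before the
 * add/sub pair:
 *
 *      // c_re = x_re*b_re - x_im*b_im      (c  = x * b)
 *      // c_im = x_re*b_im + x_im*b_re
 *      // a'   = a + c
 *      // b'   = a - c
 *      // then the angle is rotated, x *= w, exactly as in the direct body
 */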

#define FASTCONV_REVERSE_BUTTERFLY_BODY_LAST(add_re, add_im, FMA_SEL, IF_ADD) \
    size_t off; \
    float norm = 0.5f / np; \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT("vbroadcastss    %[norm], %%ymm1")                   /* ymm1 = k */ \
        __ASM_EMIT("lea             (,%[np], 4), %[off]")               /* off  = np * 4 */ \
        __ASM_EMIT("vmovaps         0x00(%[ak]), %%ymm6")               /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps         0x20(%[ak]), %%ymm7")               /* ymm7 = x_im */ \
        __ASM_EMIT("vmovaps         0x00(%[wk]), %%ymm4")               /* ymm4 = w_re */ \
        __ASM_EMIT("vmovaps         0x20(%[wk]), %%ymm5")               /* ymm5 = w_im */ \
        /* Start loop */ \
        __ASM_EMIT("1:") \
            __ASM_EMIT("vmovups         0x00(%[src]), %%ymm0")              /* ymm0 = a_re */ \
            __ASM_EMIT("vmovups         0x00(%[src], %[off], 2), %%ymm2")   /* ymm2 = b_re */ \
            __ASM_EMIT("vmovups         0x20(%[src], %[off], 2), %%ymm3")   /* ymm3 = b_im */ \
            /* Calculate complex multiplication */ \
            __ASM_EMIT("vmulps          %%ymm7, %%ymm3, %%ymm3")            /* ymm3 = x_im * b_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm6, %%ymm2, %%ymm2", ""))       /* ymm2 = x_re * b_re */ \
            __ASM_EMIT(FMA_SEL(add_re "  %%ymm3, %%ymm2, %%ymm3", add_re " %%ymm6, %%ymm2, %%ymm3")) /* ymm3 = c_re = x_re * b_re +- x_im * b_im */ \
            /* Perform butterfly */ \
            __ASM_EMIT("vsubps          %%ymm3, %%ymm0, %%ymm2")            /* ymm2 = a_re - c_re */ \
            __ASM_EMIT("vaddps          %%ymm3, %%ymm0, %%ymm0")            /* ymm0 = a_re + c_re */ \
            __ASM_EMIT("vmulps          %%ymm1, %%ymm2, %%ymm2") \
            __ASM_EMIT("vmulps          %%ymm1, %%ymm0, %%ymm0") \
            /* Store values */ \
            __ASM_EMIT(IF_ADD("vaddps   0x00(%[dst]), %%ymm0, %%ymm0")) \
            __ASM_EMIT(IF_ADD("vaddps   0x00(%[dst], %[off]), %%ymm2, %%ymm2")) \
            __ASM_EMIT("vmovups         %%ymm0, 0x00(%[dst])") \
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[dst], %[off])") \
            __ASM_EMIT("add             $0x40, %[src]") \
            __ASM_EMIT("add             $0x20, %[dst]") \
            __ASM_EMIT32("subl          $8, %[np]") \
            __ASM_EMIT64("subq          $8, %[np]") \
            __ASM_EMIT("jbe             2f") \
            /* Rotate angle */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm6, %%ymm2")            /* ymm2 = w_im * x_re */ \
            __ASM_EMIT("vmulps          %%ymm5, %%ymm7, %%ymm3")            /* ymm3 = w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm6, %%ymm6", ""))       /* ymm6 = w_re * x_re */ \
            __ASM_EMIT(FMA_SEL("vmulps  %%ymm4, %%ymm7, %%ymm7", ""))       /* ymm7 = w_re * x_im */ \
            __ASM_EMIT(FMA_SEL("vsubps  %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vaddps  %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
            /* Repeat loop */ \
        __ASM_EMIT("jmp             1b") \
        __ASM_EMIT("2:") \
        \
        : [off] "=&r" (off), [np] __ASM_ARG_RW(np) \
        : [dst] "r" (dst), [src] "r" (src), [ak] "r" (ak), [wk] "r" (wk), \
          [norm] "o" (norm) \
        : "cc", "memory",  \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7"  \
    );
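
/*
 * Scalar sketch of the last reverse stage in its non-FMA "set" instantiation
 * (add_re = vsubps, IF_ADD = FASTCONV_SET); the helper name and the float
 * indexing are illustrative assumptions. Only the real part of the result is
 * produced, scaled by 0.5 / np; the "adding" variant accumulates into dst
 * instead of overwriting it.
 *
 *  static void reverse_last_ref(float *dst, const float *src,
 *                               const float *ak, const float *wk, size_t np)
 *  {
 *      float norm = 0.5f / np;
 *      float x_re[8], x_im[8];
 *      for (size_t k=0; k<8; ++k) { x_re[k] = ak[k]; x_im[k] = ak[k+8]; }
 *      for (size_t i=0; i<np; i += 8)
 *      {
 *          for (size_t k=0; k<8; ++k)
 *          {
 *              float a_re  = src[i*2 + k];                 // first half of src
 *              float b_re  = src[np*2 + i*2 + k];          // second half of src
 *              float b_im  = src[np*2 + i*2 + k + 8];
 *              float c_re  = x_re[k]*b_re - x_im[k]*b_im;
 *              dst[i + k]      = (a_re + c_re) * norm;
 *              dst[np + i + k] = (a_re - c_re) * norm;
 *          }
 *          for (size_t k=0; k<8; ++k)                      // rotate angle: x *= w
 *          {
 *              float t  = wk[k]*x_re[k] - wk[k+8]*x_im[k];
 *              x_im[k]  = wk[k]*x_im[k] + wk[k+8]*x_re[k];
 *              x_re[k]  = t;
 *          }
 *      }
 *  }
 */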

namespace avx
{
#define FMA_OFF(a, b)       a   /* select the generic (non-FMA) instruction sequence */
#define FMA_ON(a, b)        b   /* select the FMA3 instruction sequence */
#define FASTCONV_SET(x)         /* "set" mode: drop the accumulation instructions */
#define FASTCONV_ADD(x)     x   /* "add" mode: keep the accumulation instructions */

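/*
 * Each of the nb blocks processed below contains 2*pairs complex samples
 * stored as two consecutive halves of pairs complex values each (split
 * re/im layout); off1/off2 address those halves in bytes, which is why the
 * step is pairs << 3 (pairs * 2 floats * 4 bytes).
 */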
    static inline void fastconv_direct_butterfly(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1        = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2        = off1 + step;
            np          = pairs;

            FASTCONV_DIRECT_BUTTERFLY_BODY8("vaddps", "vsubps", FMA_OFF);

            off1        = off2;
        }
    }

    static inline void fastconv_direct_butterfly_last(float *dst, size_t nb)
    {
        FASTCONV_DIRECT_BUTTERFLY_LAST("vaddps", "vsubps", FMA_OFF);
    }

    static inline void fastconv_direct_butterfly_fma3(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1        = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2        = off1 + step;
            np          = pairs;

            FASTCONV_DIRECT_BUTTERFLY_BODY8("vfmadd132ps", "vfmsub132ps", FMA_ON);

            off1        = off2;
        }
    }

    static inline void fastconv_direct_butterfly_last_fma3(float *dst, size_t nb)
    {
        FASTCONV_DIRECT_BUTTERFLY_LAST("vfmadd132ps", "vfmsub132ps", FMA_ON);
    }

    static inline void fastconv_reverse_butterfly(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1        = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2        = off1 + step;
            np          = pairs;

            FASTCONV_REVERSE_BUTTERFLY_BODY8("vsubps", "vaddps", FMA_OFF);

            off1        = off2;
        }
    }

    static inline void fastconv_reverse_butterfly_last(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vsubps", "vaddps", FMA_OFF, FASTCONV_SET);
    }

    static inline void fastconv_reverse_butterfly_last_adding(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vsubps", "vaddps", FMA_OFF, FASTCONV_ADD);
    }

    static inline void fastconv_reverse_butterfly_fma3(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1        = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2        = off1 + step;
            np          = pairs;

            FASTCONV_REVERSE_BUTTERFLY_BODY8("vfmsub231ps", "vfmadd231ps", FMA_ON);

            off1        = off2;
        }
    }

    static inline void fastconv_reverse_butterfly_last_fma3(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vfmsub231ps", "vfmadd231ps", FMA_ON, FASTCONV_SET);
    }

    static inline void fastconv_reverse_butterfly_last_adding_fma3(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vfmsub231ps", "vfmadd231ps", FMA_ON, FASTCONV_ADD);
    }

#undef FASTCONV_DIRECT_BUTTERFLY_BODY8
#undef FASTCONV_DIRECT_BUTTERFLY_LAST
#undef FASTCONV_REVERSE_BUTTERFLY_BODY8
#undef FASTCONV_REVERSE_BUTTERFLY_BODY_LAST
#undef FASTCONV_SET
#undef FASTCONV_ADD
#undef FMA_OFF
#undef FMA_ON
}