/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 9 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX_IMPL */

namespace avx
{
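    /*
     * Note: this header behaves as a macro-parameterized template. Judging by the #undef
     * block at the end of the file, the including file is expected to define
     * FFT_SCRAMBLE_SELF_DIRECT_NAME, FFT_SCRAMBLE_SELF_REVERSE_NAME,
     * FFT_SCRAMBLE_COPY_DIRECT_NAME, FFT_SCRAMBLE_COPY_REVERSE_NAME, FFT_TYPE and FFT_FMA
     * before inclusion, and may include this file several times with different definitions
     * (different index widths, FMA vs. non-FMA encodings). The routines below perform the
     * bit-reversal "scramble" step of the FFT together with the first three butterfly ranks
     * (8-element blocks), either in place (SELF) or while copying from a source buffer (COPY),
     * for the direct and reverse transform respectively.
     *
     * A purely illustrative sketch of how a configuration might be selected by the including
     * translation unit; all names below are hypothetical, not the project's actual ones:
     *
     *   #define FFT_SCRAMBLE_SELF_DIRECT_NAME   scramble_self_direct16
     *   #define FFT_SCRAMBLE_SELF_REVERSE_NAME  scramble_self_reverse16
     *   #define FFT_SCRAMBLE_COPY_DIRECT_NAME   scramble_copy_direct16
     *   #define FFT_SCRAMBLE_COPY_REVERSE_NAME  scramble_copy_reverse16
     *   #define FFT_TYPE                        uint16_t
     *   #define FFT_FMA(a, b)                   a   // or b in the FMA-enabled variant
     *   #include "scramble.h"                       // this file
     */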
    static inline void FFT_SCRAMBLE_SELF_DIRECT_NAME(float *dst_re, float *dst_im, size_t rank)
    {
        // Calculate number of items
        size_t items    = (1 << rank) - 1;

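        /*
         * The loop below performs an in-place bit-reversal permutation: each index i is
         * swapped with its bit-reversed counterpart j exactly once (pairs with i >= j are
         * skipped so that no element is swapped twice). A scalar equivalent of the inline
         * assembly block inside the loop, shown for illustration only:
         *
         *   float re    = dst_re[i], im = dst_im[i];
         *   dst_re[i]   = dst_re[j]; dst_im[i] = dst_im[j];
         *   dst_re[j]   = re;        dst_im[j] = im;
         */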
        for (size_t i = 1; i < items; ++i)
        {
            size_t j = reverse_bits(FFT_TYPE(i), rank);    /* Reverse the order of the bits */
            if (i >= j)
                continue;

            /* Copy the values from the reversed position */
            ARCH_X86_ASM
            (
                __ASM_EMIT("vmovss (%[dst_re], %[i], 4), %%xmm0")
                __ASM_EMIT("vmovss (%[dst_im], %[i], 4), %%xmm1")
                __ASM_EMIT("vmovss (%[dst_re], %[j], 4), %%xmm2")
                __ASM_EMIT("vmovss (%[dst_im], %[j], 4), %%xmm3")
                __ASM_EMIT("vmovss %%xmm2, (%[dst_re], %[i], 4)")
                __ASM_EMIT("vmovss %%xmm3, (%[dst_im], %[i], 4)")
                __ASM_EMIT("vmovss %%xmm0, (%[dst_re], %[j], 4)")
                __ASM_EMIT("vmovss %%xmm1, (%[dst_im], %[j], 4)")
                :
                : [dst_re] "r" (dst_re), [dst_im] "r" (dst_im),
                  [i] "r"(i), [j] "r"(j)
                : "memory",
                  "%xmm0", "%xmm1", "%xmm2", "%xmm3"
            );
        }

        // Perform butterfly 8x
        size_t off = 0;
        items = 1 << (rank - 3);

        // Perform 4-element butterflies
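        /*
         * Each loop iteration below processes two 8-sample blocks held in the ymm registers:
         * the 1st- and 2nd-order butterflies need only horizontal add/sub and shuffles, while
         * the 3rd-order butterfly multiplies the odd half by twiddle factors taken from the
         * FFT_A table (which appears to hold 8 real parts at offset 0x00 and 8 imaginary
         * parts at offset 0x20). In complex form the final step of the direct transform
         * computes (illustrative notation only):
         *
         *   c  = conj(x) * b    =>  c_re = x_re*b_re + x_im*b_im,  c_im = x_re*b_im - x_im*b_re
         *   a' = a + c,  b' = a - c
         */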
        ARCH_X86_ASM
        (
            /* Loop 2x 4-element butterflies */
            __ASM_EMIT("sub             $2, %[items]")
            __ASM_EMIT("jb              2f")
            __ASM_EMIT("1:")
                /* Load data to registers */
                __ASM_EMIT("vmovups         0x00(%[dst_re], %[off]), %%xmm0")               /* xmm0 = r0 r1 r2 r3 */
                __ASM_EMIT("vmovups         0x10(%[dst_re], %[off]), %%xmm4")               /* xmm4 = r4 r5 r6 r7 */
                __ASM_EMIT("vinsertf128     $1, 0x20(%[dst_re], %[off]), %%ymm0, %%ymm0")   /* ymm0 = r0 r1 r2 r3 */
                __ASM_EMIT("vinsertf128     $1, 0x30(%[dst_re], %[off]), %%ymm4, %%ymm4")   /* ymm4 = r4 r5 r6 r7 */
                __ASM_EMIT("vmovups         0x00(%[dst_im], %[off]), %%xmm2")               /* xmm2 = i0 i1 i2 i3 */
                __ASM_EMIT("vmovups         0x10(%[dst_im], %[off]), %%xmm6")               /* xmm6 = i4 i5 i6 i7 */
                __ASM_EMIT("vinsertf128     $1, 0x20(%[dst_im], %[off]), %%ymm2, %%ymm2")   /* ymm2 = i0 i1 i2 i3 */
                __ASM_EMIT("vinsertf128     $1, 0x30(%[dst_im], %[off]), %%ymm6, %%ymm6")   /* ymm6 = i4 i5 i6 i7 */
                /* 1st-order 4x butterfly */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */
                __ASM_EMIT("vhsubps         %%ymm6, %%ymm2, %%ymm3")                /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */
                __ASM_EMIT("vhaddps         %%ymm6, %%ymm2, %%ymm2")                /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */
                /* 2nd-order 4x butterfly */
                __ASM_EMIT("vblendps        $0xaa, %%ymm3, %%ymm1, %%ymm4")         /* ymm4 = r1' i3' r5' i7' */
                __ASM_EMIT("vblendps        $0xaa, %%ymm1, %%ymm3, %%ymm5")         /* ymm5 = i1' r3' i5' r7' */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7" */
                __ASM_EMIT("vhsubps         %%ymm5, %%ymm2, %%ymm3")                /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5" */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5" */
                __ASM_EMIT("vhaddps         %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm3, %%ymm2, %%ymm4")         /* ymm4 = i0" i4" i1" i5" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm2, %%ymm3, %%ymm5")         /* ymm5 = i2" i6" i3" i7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm1, %%ymm0, %%ymm2")         /* ymm2 = r0" r1" r2" r3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm1, %%ymm0, %%ymm3")         /* ymm3 = r4" r5" r6" r7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm5, %%ymm4, %%ymm6")         /* ymm6 = i0" i1" i2" i3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm5, %%ymm4, %%ymm7")         /* ymm7 = i4" i5" i6" i7" */
                /* 3rd-order 8x butterfly */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm3, %%ymm4")       /* ymm4 = x_im * b_re */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm7, %%ymm5")       /* ymm5 = x_im * b_im */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm3, %%ymm3", ""))  /* ymm3 = x_re * b_re */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm7, %%ymm7", ""))  /* ymm7 = x_re * b_im */
                __ASM_EMIT(FFT_FMA("vaddps  %%ymm5, %%ymm3, %%ymm5", "vfmadd231ps  0x00 + %[FFT_A], %%ymm3, %%ymm5"))       /* ymm5 = c_re = x_re * b_re + x_im * b_im */
                __ASM_EMIT(FFT_FMA("vsubps  %%ymm4, %%ymm7, %%ymm4", "vfmsub231ps  0x00 + %[FFT_A], %%ymm7, %%ymm4"))       /* ymm4 = c_im = x_re * b_im - x_im * b_re */
                __ASM_EMIT("vsubps          %%ymm5, %%ymm2, %%ymm0")                /* ymm0 = a_re - c_re */
                __ASM_EMIT("vsubps          %%ymm4, %%ymm6, %%ymm1")                /* ymm1 = a_im - c_im */
                __ASM_EMIT("vaddps          %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = a_re + c_re */
                __ASM_EMIT("vaddps          %%ymm4, %%ymm6, %%ymm3")                /* ymm3 = a_im + c_im */
                /* Store */
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst_re], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm2, 0x20(%[dst_re], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm0, 0x30(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm3, 0x00(%[dst_im], %[off])")
                __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst_im], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm3, 0x20(%[dst_im], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm1, 0x30(%[dst_im], %[off])")
                /* Move pointers and repeat */
                __ASM_EMIT("add             $0x40, %[off]")
                __ASM_EMIT("sub             $2, %[items]")
                __ASM_EMIT("jae             1b")
            __ASM_EMIT("2:")
            /* x4 scramble block */
            __ASM_EMIT("add             $1, %[items]")
            __ASM_EMIT("jl              4f")
                __ASM_EMIT("vmovups         0x00(%[dst_re], %[off]), %%xmm0")       /* xmm0 = r0 r1 r2 r3 */
                __ASM_EMIT("vmovups         0x10(%[dst_re], %[off]), %%xmm4")       /* xmm4 = r4 r5 r6 r7 */
                __ASM_EMIT("vmovups         0x00(%[dst_im], %[off]), %%xmm2")       /* xmm2 = i0 i1 i2 i3 */
                __ASM_EMIT("vmovups         0x10(%[dst_im], %[off]), %%xmm6")       /* xmm6 = i4 i5 i6 i7 */
                /* 1st-order 4x butterfly */
                __ASM_EMIT("vhsubps         %%xmm4, %%xmm0, %%xmm1")                /* xmm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */
                __ASM_EMIT("vhsubps         %%xmm6, %%xmm2, %%xmm3")                /* xmm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */
                __ASM_EMIT("vhaddps         %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */
                __ASM_EMIT("vhaddps         %%xmm6, %%xmm2, %%xmm2")                /* xmm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */
                /* 2nd-order 4x butterfly */
                __ASM_EMIT("vblendps        $0xaa, %%xmm3, %%xmm1, %%xmm4")         /* xmm4 = r1' i3' r5' i7' */
                __ASM_EMIT("vblendps        $0xaa, %%xmm1, %%xmm3, %%xmm5")         /* xmm5 = i1' r3' i5' r7' */
                __ASM_EMIT("vhsubps         %%xmm4, %%xmm0, %%xmm1")                /* xmm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7" */
                __ASM_EMIT("vhsubps         %%xmm5, %%xmm2, %%xmm3")                /* xmm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5" */
                __ASM_EMIT("vhaddps         %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5" */
                __ASM_EMIT("vhaddps         %%xmm5, %%xmm2, %%xmm2")                /* xmm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7" */
                __ASM_EMIT("vblendps        $0xcc, %%xmm3, %%xmm2, %%xmm4")         /* xmm4 = i0" i4" i1" i5" */
                __ASM_EMIT("vblendps        $0xcc, %%xmm2, %%xmm3, %%xmm5")         /* xmm5 = i2" i6" i3" i7" */
                __ASM_EMIT("vshufps         $0x88, %%xmm1, %%xmm0, %%xmm2")         /* xmm2 = r0" r1" r2" r3" */
                __ASM_EMIT("vshufps         $0xdd, %%xmm1, %%xmm0, %%xmm3")         /* xmm3 = r4" r5" r6" r7" */
                __ASM_EMIT("vshufps         $0x88, %%xmm5, %%xmm4, %%xmm6")         /* xmm6 = i0" i1" i2" i3" */
                __ASM_EMIT("vshufps         $0xdd, %%xmm5, %%xmm4, %%xmm7")         /* xmm7 = i4" i5" i6" i7" */
                /* 3rd-order 8x butterfly */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%xmm3, %%xmm4")       /* xmm4 = x_im * b_re */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%xmm7, %%xmm5")       /* xmm5 = x_im * b_im */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%xmm3, %%xmm3", ""))  /* xmm3 = x_re * b_re */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%xmm7, %%xmm7", ""))  /* xmm7 = x_re * b_im */
                __ASM_EMIT(FFT_FMA("vaddps  %%xmm5, %%xmm3, %%xmm5", "vfmadd231ps 0x00 + %[FFT_A], %%xmm3, %%xmm5"))        /* xmm5 = c_re = x_re * b_re + x_im * b_im */
                __ASM_EMIT(FFT_FMA("vsubps  %%xmm4, %%xmm7, %%xmm4", "vfmsub231ps 0x00 + %[FFT_A], %%xmm7, %%xmm4"))        /* xmm4 = c_im = x_re * b_im - x_im * b_re */
                __ASM_EMIT("vsubps          %%xmm5, %%xmm2, %%xmm0")                /* xmm0 = a_re - c_re */
                __ASM_EMIT("vsubps          %%xmm4, %%xmm6, %%xmm1")                /* xmm1 = a_im - c_im */
                __ASM_EMIT("vaddps          %%xmm5, %%xmm2, %%xmm2")                /* xmm2 = a_re + c_re */
                __ASM_EMIT("vaddps          %%xmm4, %%xmm6, %%xmm3")                /* xmm3 = a_im + c_im */
                /* Store */
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm3, 0x00(%[dst_im], %[off])")
                __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst_im], %[off])")
            __ASM_EMIT("4:")

            : [dst_re] "+r"(dst_re), [dst_im] "+r"(dst_im),
              [off] "+r" (off), [items] "+r"(items)
            : [FFT_A] "o" (FFT_A)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

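    /*
     * Same as the routine above, but for the reverse (inverse) transform: the scramble loop
     * and the first two butterfly ranks are identical, while the final shuffles and the
     * 3rd-order twiddle product use the non-conjugated form
     * (c_re = x_re*b_re - x_im*b_im, c_im = x_re*b_im + x_im*b_re), as noted in the
     * register comments below.
     */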
    static inline void FFT_SCRAMBLE_SELF_REVERSE_NAME(float *dst_re, float *dst_im, size_t rank)
    {
        // Calculate number of items
        size_t items    = (1 << rank) - 1;

        for (size_t i = 1; i < items; ++i)
        {
            size_t j = reverse_bits(FFT_TYPE(i), rank);    /* Reverse the order of the bits */
            if (i >= j)
                continue;

            /* Copy the values from the reversed position */
            ARCH_X86_ASM
            (
                __ASM_EMIT("vmovss (%[dst_re], %[i], 4), %%xmm0")
                __ASM_EMIT("vmovss (%[dst_im], %[i], 4), %%xmm1")
                __ASM_EMIT("vmovss (%[dst_re], %[j], 4), %%xmm2")
                __ASM_EMIT("vmovss (%[dst_im], %[j], 4), %%xmm3")
                __ASM_EMIT("vmovss %%xmm2, (%[dst_re], %[i], 4)")
                __ASM_EMIT("vmovss %%xmm3, (%[dst_im], %[i], 4)")
                __ASM_EMIT("vmovss %%xmm0, (%[dst_re], %[j], 4)")
                __ASM_EMIT("vmovss %%xmm1, (%[dst_im], %[j], 4)")
                :
                : [dst_re] "r"(dst_re), [dst_im] "r"(dst_im),
                  [i] "r"(i), [j] "r"(j)
                : "memory",
                  "%xmm0", "%xmm1", "%xmm2", "%xmm3"
            );
        }

        // Perform butterfly 8x
        size_t off = 0;
        items = 1 << (rank - 3);

        // Perform 4-element butterflies
        ARCH_X86_ASM
        (
            /* Loop 2x 4-element butterflies */
            __ASM_EMIT("sub             $2, %[items]")
            __ASM_EMIT("jb              2f")
            __ASM_EMIT("1:")
                /* Load data to registers */
                __ASM_EMIT("vmovups         0x00(%[dst_re], %[off]), %%xmm0")               /* xmm0 = r0 r1 r2 r3 */
                __ASM_EMIT("vmovups         0x10(%[dst_re], %[off]), %%xmm4")               /* xmm4 = r4 r5 r6 r7 */
                __ASM_EMIT("vinsertf128     $1, 0x20(%[dst_re], %[off]), %%ymm0, %%ymm0")   /* ymm0 = r0 r1 r2 r3 */
                __ASM_EMIT("vinsertf128     $1, 0x30(%[dst_re], %[off]), %%ymm4, %%ymm4")   /* ymm4 = r4 r5 r6 r7 */
                __ASM_EMIT("vmovups         0x00(%[dst_im], %[off]), %%xmm2")               /* xmm2 = i0 i1 i2 i3 */
                __ASM_EMIT("vmovups         0x10(%[dst_im], %[off]), %%xmm6")               /* xmm6 = i4 i5 i6 i7 */
                __ASM_EMIT("vinsertf128     $1, 0x20(%[dst_im], %[off]), %%ymm2, %%ymm2")   /* ymm2 = i0 i1 i2 i3 */
                __ASM_EMIT("vinsertf128     $1, 0x30(%[dst_im], %[off]), %%ymm6, %%ymm6")   /* ymm6 = i4 i5 i6 i7 */
                /* 1st-order 4x butterfly */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */
                __ASM_EMIT("vhsubps         %%ymm6, %%ymm2, %%ymm3")                /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */
                __ASM_EMIT("vhaddps         %%ymm6, %%ymm2, %%ymm2")                /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */
                /* 2nd-order 4x butterfly */
                __ASM_EMIT("vblendps        $0xaa, %%ymm3, %%ymm1, %%ymm4")         /* ymm4 = r1' i3' r5' i7' */
                __ASM_EMIT("vblendps        $0xaa, %%ymm1, %%ymm3, %%ymm5")         /* ymm5 = i1' r3' i5' r7' */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */
                __ASM_EMIT("vhsubps         %%ymm5, %%ymm2, %%ymm3")                /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */
                __ASM_EMIT("vhaddps         %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm1, %%ymm0, %%ymm4")         /* ymm4 = r0" r4" r1" r5" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm0, %%ymm1, %%ymm5")         /* ymm5 = r2" r6" r3" r7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm3, %%ymm2, %%ymm6")         /* ymm6 = i0" i1" i2" i3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm3, %%ymm2, %%ymm7")         /* ymm7 = i4" i5" i6" i7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm5, %%ymm4, %%ymm2")         /* ymm2 = r0" r1" r2" r3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm5, %%ymm4, %%ymm3")         /* ymm3 = r4" r5" r6" r7" */
                /* 3rd-order 8x butterfly */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm3, %%ymm4")       /* ymm4 = x_im * b_re */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm7, %%ymm5")       /* ymm5 = x_im * b_im */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm3, %%ymm3", ""))  /* ymm3 = x_re * b_re */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm7, %%ymm7", ""))  /* ymm7 = x_re * b_im */
                __ASM_EMIT(FFT_FMA("vsubps  %%ymm5, %%ymm3, %%ymm5", "vfmsub231ps  0x00 + %[FFT_A], %%ymm3, %%ymm5"))       /* ymm5 = c_re = x_re * b_re - x_im * b_im */
                __ASM_EMIT(FFT_FMA("vaddps  %%ymm4, %%ymm7, %%ymm4", "vfmadd231ps  0x00 + %[FFT_A], %%ymm7, %%ymm4"))       /* ymm4 = c_im = x_re * b_im + x_im * b_re */
                __ASM_EMIT("vsubps          %%ymm5, %%ymm2, %%ymm0")                /* ymm0 = a_re - c_re */
                __ASM_EMIT("vsubps          %%ymm4, %%ymm6, %%ymm1")                /* ymm1 = a_im - c_im */
                __ASM_EMIT("vaddps          %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = a_re + c_re */
                __ASM_EMIT("vaddps          %%ymm4, %%ymm6, %%ymm3")                /* ymm3 = a_im + c_im */
                /* Store */
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst_re], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm2, 0x20(%[dst_re], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm0, 0x30(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm3, 0x00(%[dst_im], %[off])")
                __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst_im], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm3, 0x20(%[dst_im], %[off])")
                __ASM_EMIT("vextractf128    $1, %%ymm1, 0x30(%[dst_im], %[off])")
                /* Move pointers and repeat */
                __ASM_EMIT("add             $0x40, %[off]")
                __ASM_EMIT("sub             $2, %[items]")
                __ASM_EMIT("jae             1b")
            __ASM_EMIT("2:")
            /* x4 scramble block */
            __ASM_EMIT("add             $1, %[items]")
            __ASM_EMIT("jl              4f")
                __ASM_EMIT("vmovups         0x00(%[dst_re], %[off]), %%xmm0")       /* xmm0 = r0 r1 r2 r3 */
                __ASM_EMIT("vmovups         0x10(%[dst_re], %[off]), %%xmm4")       /* xmm4 = r4 r5 r6 r7 */
                __ASM_EMIT("vmovups         0x00(%[dst_im], %[off]), %%xmm2")       /* xmm2 = i0 i1 i2 i3 */
                __ASM_EMIT("vmovups         0x10(%[dst_im], %[off]), %%xmm6")       /* xmm6 = i4 i5 i6 i7 */
                /* 1st-order 4x butterfly */
                __ASM_EMIT("vhsubps         %%xmm4, %%xmm0, %%xmm1")                /* xmm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */
                __ASM_EMIT("vhsubps         %%xmm6, %%xmm2, %%xmm3")                /* xmm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */
                __ASM_EMIT("vhaddps         %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */
                __ASM_EMIT("vhaddps         %%xmm6, %%xmm2, %%xmm2")                /* xmm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */
                /* 2nd-order 4x butterfly */
                __ASM_EMIT("vblendps        $0xaa, %%xmm3, %%xmm1, %%xmm4")         /* xmm4 = r1' i3' r5' i7' */
                __ASM_EMIT("vblendps        $0xaa, %%xmm1, %%xmm3, %%xmm5")         /* xmm5 = i1' r3' i5' r7' */
                __ASM_EMIT("vhsubps         %%xmm4, %%xmm0, %%xmm1")                /* xmm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */
                __ASM_EMIT("vhsubps         %%xmm5, %%xmm2, %%xmm3")                /* xmm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */
                __ASM_EMIT("vhaddps         %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */
                __ASM_EMIT("vhaddps         %%xmm5, %%xmm2, %%xmm2")                /* xmm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */
                __ASM_EMIT("vblendps        $0xcc, %%xmm1, %%xmm0, %%xmm4")         /* xmm4 = r0" r4" r1" r5" */
                __ASM_EMIT("vblendps        $0xcc, %%xmm0, %%xmm1, %%xmm5")         /* xmm5 = r2" r6" r3" r7" */
                __ASM_EMIT("vshufps         $0x88, %%xmm3, %%xmm2, %%xmm6")         /* xmm6 = i0" i1" i2" i3" */
                __ASM_EMIT("vshufps         $0xdd, %%xmm3, %%xmm2, %%xmm7")         /* xmm7 = i4" i5" i6" i7" */
                __ASM_EMIT("vshufps         $0x88, %%xmm5, %%xmm4, %%xmm2")         /* xmm2 = r0" r1" r2" r3" */
                __ASM_EMIT("vshufps         $0xdd, %%xmm5, %%xmm4, %%xmm3")         /* xmm3 = r4" r5" r6" r7" */
                /* 3rd-order 8x butterfly */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%xmm3, %%xmm4")       /* xmm4 = x_im * b_re */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%xmm7, %%xmm5")       /* xmm5 = x_im * b_im */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%xmm3, %%xmm3", ""))  /* xmm3 = x_re * b_re */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%xmm7, %%xmm7", ""))  /* xmm7 = x_re * b_im */
                __ASM_EMIT(FFT_FMA("vsubps  %%xmm5, %%xmm3, %%xmm5", "vfmsub231ps  0x00 + %[FFT_A], %%xmm3, %%xmm5"))       /* xmm5 = c_re = x_re * b_re - x_im * b_im */
                __ASM_EMIT(FFT_FMA("vaddps  %%xmm4, %%xmm7, %%xmm4", "vfmadd231ps  0x00 + %[FFT_A], %%xmm7, %%xmm4"))       /* xmm4 = c_im = x_re * b_im + x_im * b_re */
                __ASM_EMIT("vsubps          %%xmm5, %%xmm2, %%xmm0")                /* xmm0 = a_re - c_re */
                __ASM_EMIT("vsubps          %%xmm4, %%xmm6, %%xmm1")                /* xmm1 = a_im - c_im */
                __ASM_EMIT("vaddps          %%xmm5, %%xmm2, %%xmm2")                /* xmm2 = a_re + c_re */
                __ASM_EMIT("vaddps          %%xmm4, %%xmm6, %%xmm3")                /* xmm3 = a_im + c_im */
                /* Store */
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst_re], %[off])")
                __ASM_EMIT("vmovups         %%xmm3, 0x00(%[dst_im], %[off])")
                __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst_im], %[off])")
            __ASM_EMIT("4:")

            : [dst_re] "+r"(dst_re), [dst_im] "+r"(dst_im),
              [off] "+r" (off), [items] "+r"(items)
            : [FFT_A] "o" (FFT_A)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

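    /*
     * Out-of-place variant of the direct scramble: instead of swapping elements in place,
     * values are gathered from src_re/src_im at bit-reversed positions with vinsertps and
     * written sequentially to dst_re/dst_im, with the same first three butterfly ranks
     * applied on the fly.
     */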
    static inline void FFT_SCRAMBLE_COPY_DIRECT_NAME(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t rank)
    {
        size_t regs     = 1 << rank;

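        // Each iteration of the loop below gathers 16 complex samples from the source at
        // bit-reversed positions index, index + regs, index + 2*regs, ..., index + 15*regs
        // (rank here is presumably the full FFT rank minus 4, so that 16*regs spans the
        // whole buffer), applies the first three butterfly ranks to the two resulting
        // 8-sample blocks, and stores 0x40 bytes to each of dst_re and dst_im.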
        for (size_t i=0; i<regs; ++i)
        {
            size_t index    = reverse_bits(FFT_TYPE(i), rank);

            ARCH_X86_ASM
            (
                /* Load scalar values */
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  x x x         */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  x x x         */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  x x x         */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 x x x         */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  x r2  x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  x i2  x       */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  x r10 x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  x i10 x       */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  x r6  x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  x i6  x       */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 x r14 x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 x i14 x       */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  r1  r2  x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  i1  i2  x     */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  r9  r10 x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  i9  i10 x     */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  r5  r6  x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  i5  i6  x     */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 r13 r14 x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 i13 i14 x     */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  r1  r2  r3    */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  i1  i2  i3    */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  r9  r10 r11   */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  i9  i10 i11   */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  r5  r6  r7    */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  i5  i6  i7    */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 r13 r14 r15   */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 i13 i14 i15   */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertf128     $1, %%xmm1, %%ymm0, %%ymm0")                    /* ymm0 = r0 r1 r2 r3 ...   */
                __ASM_EMIT("vinsertf128     $1, %%xmm3, %%ymm2, %%ymm2")                    /* ymm2 = i0 i1 i2 i3 ...   */
                __ASM_EMIT("vinsertf128     $1, %%xmm5, %%ymm4, %%ymm4")                    /* ymm4 = r4 r5 r6 r7 ...   */
                __ASM_EMIT("vinsertf128     $1, %%xmm7, %%ymm6, %%ymm6")                    /* ymm6 = i4 i5 i6 i7 ...   */
                /* 1st-order 4x butterfly */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */
                __ASM_EMIT("vhsubps         %%ymm6, %%ymm2, %%ymm3")                /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */
                __ASM_EMIT("vhaddps         %%ymm6, %%ymm2, %%ymm2")                /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */
                /* 2nd-order 4x butterfly */
                __ASM_EMIT("vblendps        $0xaa, %%ymm3, %%ymm1, %%ymm4")         /* ymm4 = r1' i3' r5' i7' */
                __ASM_EMIT("vblendps        $0xaa, %%ymm1, %%ymm3, %%ymm5")         /* ymm5 = i1' r3' i5' r7' */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7" */
                __ASM_EMIT("vhsubps         %%ymm5, %%ymm2, %%ymm3")                /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5" */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5" */
                __ASM_EMIT("vhaddps         %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm3, %%ymm2, %%ymm4")         /* ymm4 = i0" i4" i1" i5" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm2, %%ymm3, %%ymm5")         /* ymm5 = i2" i6" i3" i7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm1, %%ymm0, %%ymm2")         /* ymm2 = r0" r1" r2" r3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm1, %%ymm0, %%ymm3")         /* ymm3 = r4" r5" r6" r7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm5, %%ymm4, %%ymm6")         /* ymm6 = i0" i1" i2" i3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm5, %%ymm4, %%ymm7")         /* ymm7 = i4" i5" i6" i7" */
                /* 3rd-order 8x butterfly */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm3, %%ymm4")       /* ymm4 = x_im * b_re */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm7, %%ymm5")       /* ymm5 = x_im * b_im */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm3, %%ymm3", ""))  /* ymm3 = x_re * b_re */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm7, %%ymm7", ""))  /* ymm7 = x_re * b_im */
                __ASM_EMIT(FFT_FMA("vaddps  %%ymm5, %%ymm3, %%ymm5", "vfmadd231ps  0x00 + %[FFT_A], %%ymm3, %%ymm5"))       /* ymm5 = c_re = x_re * b_re + x_im * b_im */
                __ASM_EMIT(FFT_FMA("vsubps  %%ymm4, %%ymm7, %%ymm4", "vfmsub231ps  0x00 + %[FFT_A], %%ymm7, %%ymm4"))       /* ymm4 = c_im = x_re * b_im - x_im * b_re */
                __ASM_EMIT("vsubps          %%ymm5, %%ymm2, %%ymm0")                /* ymm0 = a_re - c_re */
                __ASM_EMIT("vsubps          %%ymm4, %%ymm6, %%ymm1")                /* ymm1 = a_im - c_im */
                __ASM_EMIT("vaddps          %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = a_re + c_re */
                __ASM_EMIT("vaddps          %%ymm4, %%ymm6, %%ymm3")                /* ymm3 = a_im + c_im */
                /* Store */
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst_re])")
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst_re])")
                __ASM_EMIT("vextractf128    $1, %%ymm2, 0x20(%[dst_re])")
                __ASM_EMIT("vextractf128    $1, %%ymm0, 0x30(%[dst_re])")
                __ASM_EMIT("vmovups         %%xmm3, 0x00(%[dst_im])")
                __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst_im])")
                __ASM_EMIT("vextractf128    $1, %%ymm3, 0x20(%[dst_im])")
                __ASM_EMIT("vextractf128    $1, %%ymm1, 0x30(%[dst_im])")
                __ASM_EMIT("add             $0x40, %[dst_re]")
                __ASM_EMIT("add             $0x40, %[dst_im]")

                : [dst_re] "+r" (dst_re), [dst_im] "+r"(dst_im), [index] "+r"(index)
                : [src_re] "r" (src_re), [src_im] "r"(src_im), [regs] __ASM_ARG_RO(regs),
                  [FFT_A] "o" (FFT_A)
                : "cc", "memory",
                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
                  "%xmm4", "%xmm5", "%xmm6", "%xmm7"
            );
        }
    }

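    /*
     * Out-of-place variant of the reverse scramble: same gathering scheme as above,
     * combined with the reverse-transform butterfly (non-conjugated twiddle product).
     */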
    static inline void FFT_SCRAMBLE_COPY_REVERSE_NAME(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t rank)
    {
        size_t regs     = 1 << rank;

        for (size_t i=0; i<regs; ++i)
        {
            size_t index    = reverse_bits(FFT_TYPE(i), rank);

            ARCH_X86_ASM
            (
                /* Load scalar values */
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  x x x         */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  x x x         */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  x x x         */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x00, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 x x x         */
                __ASM_EMIT("vinsertps       $0x00, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 x x x         */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  x r2  x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  x i2  x       */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  x r10 x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  x i10 x       */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  x r6  x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  x i6  x       */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x20, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 x r14 x       */
                __ASM_EMIT("vinsertps       $0x20, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 x i14 x       */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  r1  r2  x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  i1  i2  x     */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  r9  r10 x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  i9  i10 x     */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  r5  r6  x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  i5  i6  x     */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x10, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 r13 r14 x     */
                __ASM_EMIT("vinsertps       $0x10, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 i13 i14 x     */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm0, %%xmm0")       /* xmm0 = r0  r1  r2  r3    */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm2, %%xmm2")       /* xmm2 = i0  i1  i2  i3    */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm1, %%xmm1")       /* xmm1 = r8  r9  r10 r11   */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm3, %%xmm3")       /* xmm3 = i8  i9  i10 i11   */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm4, %%xmm4")       /* xmm4 = r4  r5  r6  r7    */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm6, %%xmm6")       /* xmm6 = i4  i5  i6  i7    */
                __ASM_EMIT("add             %[regs], %[index]")
                __ASM_EMIT("vinsertps       $0x30, (%[src_re], %[index], 4), %%xmm5, %%xmm5")       /* xmm5 = r12 r13 r14 r15   */
                __ASM_EMIT("vinsertps       $0x30, (%[src_im], %[index], 4), %%xmm7, %%xmm7")       /* xmm7 = i12 i13 i14 i15   */
                __ASM_EMIT("add             %[regs], %[index]")

                __ASM_EMIT("vinsertf128     $1, %%xmm1, %%ymm0, %%ymm0")                    /* ymm0 = r0 r1 r2 r3 ...   */
                __ASM_EMIT("vinsertf128     $1, %%xmm3, %%ymm2, %%ymm2")                    /* ymm2 = i0 i1 i2 i3 ...   */
                __ASM_EMIT("vinsertf128     $1, %%xmm5, %%ymm4, %%ymm4")                    /* ymm4 = r4 r5 r6 r7 ...   */
                __ASM_EMIT("vinsertf128     $1, %%xmm7, %%ymm6, %%ymm6")                    /* ymm6 = i4 i5 i6 i7 ...   */
                /* 1st-order 4x butterfly */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */
                __ASM_EMIT("vhsubps         %%ymm6, %%ymm2, %%ymm3")                /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */
                __ASM_EMIT("vhaddps         %%ymm6, %%ymm2, %%ymm2")                /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */
                /* 2nd-order 4x butterfly */
                __ASM_EMIT("vblendps        $0xaa, %%ymm3, %%ymm1, %%ymm4")         /* ymm4 = r1' i3' r5' i7' */
                __ASM_EMIT("vblendps        $0xaa, %%ymm1, %%ymm3, %%ymm5")         /* ymm5 = i1' r3' i5' r7' */
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */
                __ASM_EMIT("vhsubps         %%ymm5, %%ymm2, %%ymm3")                /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */
                __ASM_EMIT("vhaddps         %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm1, %%ymm0, %%ymm4")         /* ymm4 = r0" r4" r1" r5" */
                __ASM_EMIT("vblendps        $0xcc, %%ymm0, %%ymm1, %%ymm5")         /* ymm5 = r2" r6" r3" r7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm3, %%ymm2, %%ymm6")         /* ymm6 = i0" i1" i2" i3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm3, %%ymm2, %%ymm7")         /* ymm7 = i4" i5" i6" i7" */
                __ASM_EMIT("vshufps         $0x88, %%ymm5, %%ymm4, %%ymm2")         /* ymm2 = r0" r1" r2" r3" */
                __ASM_EMIT("vshufps         $0xdd, %%ymm5, %%ymm4, %%ymm3")         /* ymm3 = r4" r5" r6" r7" */
                /* 3rd-order 8x butterfly */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm3, %%ymm4")       /* ymm4 = x_im * b_re */
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm7, %%ymm5")       /* ymm5 = x_im * b_im */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm3, %%ymm3", ""))  /* ymm3 = x_re * b_re */
                __ASM_EMIT(FFT_FMA("vmulps  0x00 + %[FFT_A], %%ymm7, %%ymm7", ""))  /* ymm7 = x_re * b_im */
                __ASM_EMIT(FFT_FMA("vsubps  %%ymm5, %%ymm3, %%ymm5", "vfmsub231ps  0x00 + %[FFT_A], %%ymm3, %%ymm5"))       /* ymm5 = c_re = x_re * b_re - x_im * b_im */
                __ASM_EMIT(FFT_FMA("vaddps  %%ymm4, %%ymm7, %%ymm4", "vfmadd231ps  0x00 + %[FFT_A], %%ymm7, %%ymm4"))       /* ymm4 = c_im = x_re * b_im + x_im * b_re */
                __ASM_EMIT("vsubps          %%ymm5, %%ymm2, %%ymm0")                /* ymm0 = a_re - c_re */
                __ASM_EMIT("vsubps          %%ymm4, %%ymm6, %%ymm1")                /* ymm1 = a_im - c_im */
                __ASM_EMIT("vaddps          %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = a_re + c_re */
                __ASM_EMIT("vaddps          %%ymm4, %%ymm6, %%ymm3")                /* ymm3 = a_im + c_im */
                /* Store */
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst_re])")
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst_re])")
                __ASM_EMIT("vextractf128    $1, %%ymm2, 0x20(%[dst_re])")
                __ASM_EMIT("vextractf128    $1, %%ymm0, 0x30(%[dst_re])")
                __ASM_EMIT("vmovups         %%xmm3, 0x00(%[dst_im])")
                __ASM_EMIT("vmovups         %%xmm1, 0x10(%[dst_im])")
                __ASM_EMIT("vextractf128    $1, %%ymm3, 0x20(%[dst_im])")
                __ASM_EMIT("vextractf128    $1, %%ymm1, 0x30(%[dst_im])")
                __ASM_EMIT("add             $0x40, %[dst_re]")
                __ASM_EMIT("add             $0x40, %[dst_im]")

                : [dst_re] "+r" (dst_re), [dst_im] "+r"(dst_im), [index] "+r"(index)
                : [src_re] "r"(src_re), [src_im] "r"(src_im), [regs] __ASM_ARG_RO(regs),
                  [FFT_A] "o" (FFT_A)
                : "cc", "memory",
                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
                  "%xmm4", "%xmm5", "%xmm6", "%xmm7"
            );
        }
    }
}

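/*
 * The parameterizing macros are undefined so that this header can be included again with a
 * different configuration; FFT_A (the constant table referenced above) is assumed to be
 * provided by the surrounding code and is not affected here.
 */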
#undef FFT_SCRAMBLE_SELF_DIRECT_NAME
#undef FFT_SCRAMBLE_SELF_REVERSE_NAME
#undef FFT_SCRAMBLE_COPY_DIRECT_NAME
#undef FFT_SCRAMBLE_COPY_REVERSE_NAME
#undef FFT_TYPE
#undef FFT_FMA
