1 /* 2 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/> 3 * (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com> 4 * 5 * This file is part of lsp-plugins 6 * Created on: 9 дек. 2019 г. 7 * 8 * lsp-plugins is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Lesser General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * any later version. 12 * 13 * lsp-plugins is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public License 19 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>. 20 */ 21 22 #ifndef DSP_ARCH_X86_AVX_IMPL 23 #error "This header should not be included directly" 24 #endif /* DSP_ARCH_X86_AVX_IMPL */ 25 26 namespace avx 27 { FFT_SCRAMBLE_SELF_DIRECT_NAME(float * dst_re,float * dst_im,size_t rank)28 static inline void FFT_SCRAMBLE_SELF_DIRECT_NAME(float *dst_re, float *dst_im, size_t rank) 29 { 30 // Calculate number of items 31 size_t items = (1 << rank) - 1; 32 33 for (size_t i = 1; i < items; ++i) 34 { 35 size_t j = reverse_bits(FFT_TYPE(i), rank); /* Reverse the order of the bits */ 36 if (i >= j) 37 continue; 38 39 /* Copy the values from the reversed position */ 40 ARCH_X86_ASM 41 ( 42 __ASM_EMIT("vmovss (%[dst_re], %[i], 4), %%xmm0") 43 __ASM_EMIT("vmovss (%[dst_im], %[i], 4), %%xmm1") 44 __ASM_EMIT("vmovss (%[dst_re], %[j], 4), %%xmm2") 45 __ASM_EMIT("vmovss (%[dst_im], %[j], 4), %%xmm3") 46 __ASM_EMIT("vmovss %%xmm2, (%[dst_re], %[i], 4)") 47 __ASM_EMIT("vmovss %%xmm3, (%[dst_im], %[i], 4)") 48 __ASM_EMIT("vmovss %%xmm0, (%[dst_re], %[j], 4)") 49 __ASM_EMIT("vmovss %%xmm1, (%[dst_im], %[j], 4)") 50 : 51 : [dst_re] "r" (dst_re), [dst_im] "r" (dst_im), 52 [i] "r"(i), [j] "r"(j) 53 : "memory", 54 "%xmm0", "%xmm1", "%xmm2", "%xmm3" 55 ); 56 } 57 58 // Perform butterfly 8x 59 size_t off = 0; 60 items = 1 << (rank - 3); 61 62 // Perform 4-element butterflies 63 ARCH_X86_ASM 64 ( 65 /* Loop 2x 4-element butterflies */ 66 __ASM_EMIT("sub $2, %[items]") 67 __ASM_EMIT("jb 2f") 68 __ASM_EMIT("1:") 69 /* Load data to registers */ 70 __ASM_EMIT("vmovups 0x00(%[dst_re], %[off]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 71 __ASM_EMIT("vmovups 0x10(%[dst_re], %[off]), %%xmm4") /* xmm4 = r4 r5 r6 r7 */ 72 __ASM_EMIT("vinsertf128 $1, 0x20(%[dst_re], %[off]), %%ymm0, %%ymm0") /* ymm0 = r0 r1 r2 r3 */ 73 __ASM_EMIT("vinsertf128 $1, 0x30(%[dst_re], %[off]), %%ymm4, %%ymm4") /* ymm4 = r4 r5 r6 r7 */ 74 __ASM_EMIT("vmovups 0x00(%[dst_im], %[off]), %%xmm2") /* xmm2 = i0 i1 i2 i3 */ 75 __ASM_EMIT("vmovups 0x10(%[dst_im], %[off]), %%xmm6") /* xmm6 = i4 i5 i6 i7 */ 76 __ASM_EMIT("vinsertf128 $1, 0x20(%[dst_im], %[off]), %%ymm2, %%ymm2") /* ymm2 = i0 i1 i2 i3 */ 77 __ASM_EMIT("vinsertf128 $1, 0x30(%[dst_im], %[off]), %%ymm6, %%ymm6") /* ymm6 = i4 i5 i6 i7 */ 78 /* 1st-order 4x butterfly */ 79 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ 80 __ASM_EMIT("vhsubps %%ymm6, %%ymm2, %%ymm3") /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ 81 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ 82 __ASM_EMIT("vhaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ 83 /* 2nd-order 4x butterfly */ 84 __ASM_EMIT("vblendps $0xaa, %%ymm3, %%ymm1, %%ymm4") /* ymm4 = r1' i3' r5' i7' */ 85 __ASM_EMIT("vblendps $0xaa, %%ymm1, %%ymm3, %%ymm5") /* ymm5 = i1' r3' i5' r7' */ 86 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7" */ 87 __ASM_EMIT("vhsubps %%ymm5, %%ymm2, %%ymm3") /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5" */ 88 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5" */ 89 __ASM_EMIT("vhaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7" */ 90 __ASM_EMIT("vblendps $0xcc, %%ymm3, %%ymm2, %%ymm4") /* ymm4 = i0" i4" i1" i5" */ 91 __ASM_EMIT("vblendps $0xcc, %%ymm2, %%ymm3, %%ymm5") /* ymm5 = i2" i6" i3" i7" */ 92 __ASM_EMIT("vshufps $0x88, %%ymm1, %%ymm0, %%ymm2") /* ymm2 = r0" r1" r2" r3" */ 93 __ASM_EMIT("vshufps $0xdd, %%ymm1, %%ymm0, %%ymm3") /* ymm3 = r4" r5" r6" r7" */ 94 __ASM_EMIT("vshufps $0x88, %%ymm5, %%ymm4, %%ymm6") /* ymm6 = i0" i1" i2" i3" */ 95 __ASM_EMIT("vshufps $0xdd, %%ymm5, %%ymm4, %%ymm7") /* ymm7 = i4" i5" i6" i7" */ 96 /* 3rd-order 8x butterfly */ 97 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm3, %%ymm4") /* ymm4 = x_im * b_re */ \ 98 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm7, %%ymm5") /* ymm5 = x_im * b_im */ \ 99 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm3, %%ymm3", "")) /* ymm3 = x_re * b_re */ \ 100 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm7, %%ymm7", "")) /* ymm7 = x_re * b_im */ \ 101 __ASM_EMIT(FFT_FMA("vaddps %%ymm5, %%ymm3, %%ymm5", "vfmadd231ps 0x00 + %[FFT_A], %%ymm3, %%ymm5")) /* ymm5 = c_re = x_re * b_re + x_im * b_im */ \ 102 __ASM_EMIT(FFT_FMA("vsubps %%ymm4, %%ymm7, %%ymm4", "vfmsub231ps 0x00 + %[FFT_A], %%ymm7, %%ymm4")) /* ymm4 = c_im = x_re * b_im - x_im * b_re */ \ 103 __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm0") /* ymm0 = a_re - c_re */ \ 104 __ASM_EMIT("vsubps %%ymm4, %%ymm6, %%ymm1") /* ymm1 = a_im - c_im */ \ 105 __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = a_re + c_re */ \ 106 __ASM_EMIT("vaddps %%ymm4, %%ymm6, %%ymm3") /* ymm3 = a_im + c_im */ \ 107 /* Store */ 108 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst_re], %[off])") 109 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst_re], %[off])") 110 __ASM_EMIT("vextractf128 $1, %%ymm2, 0x20(%[dst_re], %[off])") 111 __ASM_EMIT("vextractf128 $1, %%ymm0, 0x30(%[dst_re], %[off])") 112 __ASM_EMIT("vmovups %%xmm3, 0x00(%[dst_im], %[off])") 113 __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst_im], %[off])") 114 __ASM_EMIT("vextractf128 $1, %%ymm3, 0x20(%[dst_im], %[off])") 115 __ASM_EMIT("vextractf128 $1, %%ymm1, 0x30(%[dst_im], %[off])") 116 /* Move pointers and repeat*/ 117 __ASM_EMIT("add $0x40, %[off]") 118 __ASM_EMIT("sub $2, %[items]") 119 __ASM_EMIT("jae 1b") 120 __ASM_EMIT("2:") 121 /* x4 scramble block */ 122 __ASM_EMIT("add $1, %[items]") 123 __ASM_EMIT("jl 4f") 124 __ASM_EMIT("vmovups 0x00(%[dst_re], %[off]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 125 __ASM_EMIT("vmovups 0x10(%[dst_re], %[off]), %%xmm4") /* xmm4 = r4 r5 r6 r7 */ 126 __ASM_EMIT("vmovups 0x00(%[dst_im], %[off]), %%xmm2") /* xmm2 = i0 i1 i2 i3 */ 127 __ASM_EMIT("vmovups 0x10(%[dst_im], %[off]), %%xmm6") /* xmm6 = i4 i5 i6 i7 */ 128 /* 1st-order 4x butterfly */ 129 __ASM_EMIT("vhsubps %%xmm4, %%xmm0, %%xmm1") /* xmm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ 130 __ASM_EMIT("vhsubps %%xmm6, %%xmm2, %%xmm3") /* xmm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ 131 __ASM_EMIT("vhaddps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ 132 __ASM_EMIT("vhaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ 133 /* 2nd-order 4x butterfly */ 134 __ASM_EMIT("vblendps $0xaa, %%xmm3, %%xmm1, %%xmm4") /* xmm4 = r1' i3' r5' i7' */ 135 __ASM_EMIT("vblendps $0xaa, %%xmm1, %%xmm3, %%xmm5") /* xmm5 = i1' r3' i5' r7' */ 136 __ASM_EMIT("vhsubps %%xmm4, %%xmm0, %%xmm1") /* xmm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7" */ 137 __ASM_EMIT("vhsubps %%xmm5, %%xmm2, %%xmm3") /* xmm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5" */ 138 __ASM_EMIT("vhaddps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5" */ 139 __ASM_EMIT("vhaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7" */ 140 __ASM_EMIT("vblendps $0xcc, %%xmm3, %%xmm2, %%xmm4") /* xmm4 = i0" i4" i1" i5" */ 141 __ASM_EMIT("vblendps $0xcc, %%xmm2, %%xmm3, %%xmm5") /* xmm5 = i2" i6" i3" i7" */ 142 __ASM_EMIT("vshufps $0x88, %%xmm1, %%xmm0, %%xmm2") /* xmm2 = r0" r1" r2" r3" */ 143 __ASM_EMIT("vshufps $0xdd, %%xmm1, %%xmm0, %%xmm3") /* xmm3 = r4" r5" r6" r7" */ 144 __ASM_EMIT("vshufps $0x88, %%xmm5, %%xmm4, %%xmm6") /* xmm6 = i0" i1" i2" i3" */ 145 __ASM_EMIT("vshufps $0xdd, %%xmm5, %%xmm4, %%xmm7") /* xmm7 = i4" i5" i6" i7" */ 146 /* 3rd-order 8x butterfly */ 147 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%xmm3, %%xmm4") /* xmm4 = x_im * b_re */ \ 148 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%xmm7, %%xmm5") /* xmm5 = x_im * b_im */ \ 149 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%xmm3, %%xmm3", "")) /* xmm3 = x_re * b_re */ \ 150 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%xmm7, %%xmm7", "")) /* xmm7 = x_re * b_im */ \ 151 __ASM_EMIT(FFT_FMA("vaddps %%xmm5, %%xmm3, %%xmm5", "vfmadd231ps 0x00 + %[FFT_A], %%xmm3, %%xmm5")) /* xmm5 = c_re = x_re * b_re + x_im * b_im */ \ 152 __ASM_EMIT(FFT_FMA("vsubps %%xmm4, %%xmm7, %%xmm4", "vfmsub231ps 0x00 + %[FFT_A], %%xmm7, %%xmm4")) /* xmm4 = c_im = x_re * b_im - x_im * b_re */ \ 153 __ASM_EMIT("vsubps %%xmm5, %%xmm2, %%xmm0") /* xmm0 = a_re - c_re */ \ 154 __ASM_EMIT("vsubps %%xmm4, %%xmm6, %%xmm1") /* xmm1 = a_im - c_im */ \ 155 __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = a_re + c_re */ \ 156 __ASM_EMIT("vaddps %%xmm4, %%xmm6, %%xmm3") /* xmm3 = a_im + c_im */ \ 157 /* Store */ 158 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst_re], %[off])") 159 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst_re], %[off])") 160 __ASM_EMIT("vmovups %%xmm3, 0x00(%[dst_im], %[off])") 161 __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst_im], %[off])") 162 __ASM_EMIT("4:") 163 164 : [dst_re] "+r"(dst_re), [dst_im] "+r"(dst_im), 165 [off] "+r" (off), [items] "+r"(items) 166 : [FFT_A] "o" (FFT_A) 167 : "cc", "memory", 168 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 169 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 170 ); 171 } 172 FFT_SCRAMBLE_SELF_REVERSE_NAME(float * dst_re,float * dst_im,size_t rank)173 static inline void FFT_SCRAMBLE_SELF_REVERSE_NAME(float *dst_re, float *dst_im, size_t rank) 174 { 175 // Calculate number of items 176 size_t items = (1 << rank) - 1; 177 178 for (size_t i = 1; i < items; ++i) 179 { 180 size_t j = reverse_bits(FFT_TYPE(i), rank); /* Reverse the order of the bits */ 181 if (i >= j) 182 continue; 183 184 /* Copy the values from the reversed position */ 185 ARCH_X86_ASM 186 ( 187 __ASM_EMIT("vmovss (%[dst_re], %[i], 4), %%xmm0") 188 __ASM_EMIT("vmovss (%[dst_im], %[i], 4), %%xmm1") 189 __ASM_EMIT("vmovss (%[dst_re], %[j], 4), %%xmm2") 190 __ASM_EMIT("vmovss (%[dst_im], %[j], 4), %%xmm3") 191 __ASM_EMIT("vmovss %%xmm2, (%[dst_re], %[i], 4)") 192 __ASM_EMIT("vmovss %%xmm3, (%[dst_im], %[i], 4)") 193 __ASM_EMIT("vmovss %%xmm0, (%[dst_re], %[j], 4)") 194 __ASM_EMIT("vmovss %%xmm1, (%[dst_im], %[j], 4)") 195 : 196 : [dst_re] "r"(dst_re), [dst_im] "r"(dst_im), 197 [i] "r"(i), [j] "r"(j) 198 : "memory", 199 "%xmm0", "%xmm1", "%xmm2", "%xmm3" 200 ); 201 } 202 203 // Perform butterfly 8x 204 size_t off = 0; 205 items = 1 << (rank - 3); 206 207 // Perform 4-element butterflies 208 ARCH_X86_ASM 209 ( 210 /* Loop 2x 4-element butterflies */ 211 __ASM_EMIT("sub $2, %[items]") 212 __ASM_EMIT("jb 2f") 213 __ASM_EMIT("1:") 214 /* Load data to registers */ 215 __ASM_EMIT("vmovups 0x00(%[dst_re], %[off]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 216 __ASM_EMIT("vmovups 0x10(%[dst_re], %[off]), %%xmm4") /* xmm4 = r4 r5 r6 r7 */ 217 __ASM_EMIT("vinsertf128 $1, 0x20(%[dst_re], %[off]), %%ymm0, %%ymm0") /* ymm0 = r0 r1 r2 r3 */ 218 __ASM_EMIT("vinsertf128 $1, 0x30(%[dst_re], %[off]), %%ymm4, %%ymm4") /* ymm4 = r4 r5 r6 r7 */ 219 __ASM_EMIT("vmovups 0x00(%[dst_im], %[off]), %%xmm2") /* xmm2 = i0 i1 i2 i3 */ 220 __ASM_EMIT("vmovups 0x10(%[dst_im], %[off]), %%xmm6") /* xmm6 = i4 i5 i6 i7 */ 221 __ASM_EMIT("vinsertf128 $1, 0x20(%[dst_im], %[off]), %%ymm2, %%ymm2") /* ymm2 = i0 i1 i2 i3 */ 222 __ASM_EMIT("vinsertf128 $1, 0x30(%[dst_im], %[off]), %%ymm6, %%ymm6") /* ymm6 = i4 i5 i6 i7 */ 223 /* 1st-order 4x butterfly */ 224 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ 225 __ASM_EMIT("vhsubps %%ymm6, %%ymm2, %%ymm3") /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ 226 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ 227 __ASM_EMIT("vhaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ 228 /* 2nd-order 4x butterfly */ 229 __ASM_EMIT("vblendps $0xaa, %%ymm3, %%ymm1, %%ymm4") /* ymm4 = r1' i3' r5' i7' */ 230 __ASM_EMIT("vblendps $0xaa, %%ymm1, %%ymm3, %%ymm5") /* ymm5 = i1' r3' i5' r7' */ 231 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ 232 __ASM_EMIT("vhsubps %%ymm5, %%ymm2, %%ymm3") /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ 233 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ 234 __ASM_EMIT("vhaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ 235 __ASM_EMIT("vblendps $0xcc, %%ymm1, %%ymm0, %%ymm4") /* ymm4 = r0" i4" r1" r5" */ 236 __ASM_EMIT("vblendps $0xcc, %%ymm0, %%ymm1, %%ymm5") /* ymm5 = r2" r6" r3" r7" */ 237 __ASM_EMIT("vshufps $0x88, %%ymm3, %%ymm2, %%ymm6") /* ymm6 = i0" i1" i2" i3" */ 238 __ASM_EMIT("vshufps $0xdd, %%ymm3, %%ymm2, %%ymm7") /* ymm7 = i4" i5" i6" i7" */ 239 __ASM_EMIT("vshufps $0x88, %%ymm5, %%ymm4, %%ymm2") /* ymm2 = r0" r1" r2" r3" */ 240 __ASM_EMIT("vshufps $0xdd, %%ymm5, %%ymm4, %%ymm3") /* ymm3 = r4" r5" r6" r7" */ 241 /* 3rd-order 8x butterfly */ 242 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm3, %%ymm4") /* ymm4 = x_im * b_re */ \ 243 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm7, %%ymm5") /* ymm5 = x_im * b_im */ \ 244 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm3, %%ymm3", "")) /* ymm3 = x_re * b_re */ \ 245 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm7, %%ymm7", "")) /* ymm7 = x_re * b_im */ \ 246 __ASM_EMIT(FFT_FMA("vsubps %%ymm5, %%ymm3, %%ymm5", "vfmsub231ps 0x00 + %[FFT_A], %%ymm3, %%ymm5")) /* ymm5 = c_re = x_re * b_re - x_im * b_im */ \ 247 __ASM_EMIT(FFT_FMA("vaddps %%ymm4, %%ymm7, %%ymm4", "vfmadd231ps 0x00 + %[FFT_A], %%ymm7, %%ymm4")) /* ymm4 = c_im = x_re * b_im + x_im * b_re */ \ 248 __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm0") /* ymm0 = a_re - c_re */ \ 249 __ASM_EMIT("vsubps %%ymm4, %%ymm6, %%ymm1") /* ymm1 = a_im - c_im */ \ 250 __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = a_re + c_re */ \ 251 __ASM_EMIT("vaddps %%ymm4, %%ymm6, %%ymm3") /* ymm3 = a_im + c_im */ \ 252 /* Store */ 253 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst_re], %[off])") 254 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst_re], %[off])") 255 __ASM_EMIT("vextractf128 $1, %%ymm2, 0x20(%[dst_re], %[off])") 256 __ASM_EMIT("vextractf128 $1, %%ymm0, 0x30(%[dst_re], %[off])") 257 __ASM_EMIT("vmovups %%xmm3, 0x00(%[dst_im], %[off])") 258 __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst_im], %[off])") 259 __ASM_EMIT("vextractf128 $1, %%ymm3, 0x20(%[dst_im], %[off])") 260 __ASM_EMIT("vextractf128 $1, %%ymm1, 0x30(%[dst_im], %[off])") 261 /* Move pointers and repeat*/ 262 __ASM_EMIT("add $0x40, %[off]") 263 __ASM_EMIT("sub $2, %[items]") 264 __ASM_EMIT("jae 1b") 265 __ASM_EMIT("2:") 266 /* x4 scramble block */ 267 __ASM_EMIT("add $1, %[items]") 268 __ASM_EMIT("jl 4f") 269 __ASM_EMIT("vmovups 0x00(%[dst_re], %[off]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 270 __ASM_EMIT("vmovups 0x10(%[dst_re], %[off]), %%xmm4") /* xmm4 = r4 r5 r6 r7 */ 271 __ASM_EMIT("vmovups 0x00(%[dst_im], %[off]), %%xmm2") /* xmm2 = i0 i1 i2 i3 */ 272 __ASM_EMIT("vmovups 0x10(%[dst_im], %[off]), %%xmm6") /* xmm6 = i4 i5 i6 i7 */ 273 /* 1st-order 4x butterfly */ 274 __ASM_EMIT("vhsubps %%xmm4, %%xmm0, %%xmm1") /* xmm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ 275 __ASM_EMIT("vhsubps %%xmm6, %%xmm2, %%xmm3") /* xmm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ 276 __ASM_EMIT("vhaddps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ 277 __ASM_EMIT("vhaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ 278 /* 2nd-order 4x butterfly */ 279 __ASM_EMIT("vblendps $0xaa, %%xmm3, %%xmm1, %%xmm4") /* xmm4 = r1' i3' r5' i7' */ 280 __ASM_EMIT("vblendps $0xaa, %%xmm1, %%xmm3, %%xmm5") /* xmm5 = i1' r3' i5' r7' */ 281 __ASM_EMIT("vhsubps %%xmm4, %%xmm0, %%xmm1") /* xmm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ 282 __ASM_EMIT("vhsubps %%xmm5, %%xmm2, %%xmm3") /* xmm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ 283 __ASM_EMIT("vhaddps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ 284 __ASM_EMIT("vhaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ 285 __ASM_EMIT("vblendps $0xcc, %%xmm1, %%xmm0, %%xmm4") /* xmm4 = r0" i4" r1" r5" */ 286 __ASM_EMIT("vblendps $0xcc, %%xmm0, %%xmm1, %%xmm5") /* xmm5 = r2" r6" r3" r7" */ 287 __ASM_EMIT("vshufps $0x88, %%xmm3, %%xmm2, %%xmm6") /* xmm6 = i0" i1" i2" i3" */ 288 __ASM_EMIT("vshufps $0xdd, %%xmm3, %%xmm2, %%xmm7") /* xmm7 = i4" i5" i6" i7" */ 289 __ASM_EMIT("vshufps $0x88, %%xmm5, %%xmm4, %%xmm2") /* xmm2 = r0" r1" r2" r3" */ 290 __ASM_EMIT("vshufps $0xdd, %%xmm5, %%xmm4, %%xmm3") /* xmm3 = r4" r5" r6" r7" */ 291 /* 3rd-order 8x butterfly */ 292 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%xmm3, %%xmm4") /* xmm4 = x_im * b_re */ \ 293 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%xmm7, %%xmm5") /* xmm5 = x_im * b_im */ \ 294 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%xmm3, %%xmm3", "")) /* xmm3 = x_re * b_re */ \ 295 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%xmm7, %%xmm7", "")) /* xmm7 = x_re * b_im */ \ 296 __ASM_EMIT(FFT_FMA("vsubps %%xmm5, %%xmm3, %%xmm5", "vfmsub231ps 0x00 + %[FFT_A], %%xmm3, %%xmm5")) /* xmm5 = c_re = x_re * b_re - x_im * b_im */ \ 297 __ASM_EMIT(FFT_FMA("vaddps %%xmm4, %%xmm7, %%xmm4", "vfmadd231ps 0x00 + %[FFT_A], %%xmm7, %%xmm4")) /* xmm4 = c_im = x_re * b_im + x_im * b_re */ \ 298 __ASM_EMIT("vsubps %%xmm5, %%xmm2, %%xmm0") /* xmm0 = a_re - c_re */ \ 299 __ASM_EMIT("vsubps %%xmm4, %%xmm6, %%xmm1") /* xmm1 = a_im - c_im */ \ 300 __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = a_re + c_re */ \ 301 __ASM_EMIT("vaddps %%xmm4, %%xmm6, %%xmm3") /* xmm3 = a_im + c_im */ \ 302 /* Store */ 303 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst_re], %[off])") 304 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst_re], %[off])") 305 __ASM_EMIT("vmovups %%xmm3, 0x00(%[dst_im], %[off])") 306 __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst_im], %[off])") 307 __ASM_EMIT("4:") 308 309 : [dst_re] "+r"(dst_re), [dst_im] "+r"(dst_im), 310 [off] "+r" (off), [items] "+r"(items) 311 : [FFT_A] "o" (FFT_A) 312 : "cc", "memory", 313 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 314 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 315 ); 316 } 317 FFT_SCRAMBLE_COPY_DIRECT_NAME(float * dst_re,float * dst_im,const float * src_re,const float * src_im,size_t rank)318 static inline void FFT_SCRAMBLE_COPY_DIRECT_NAME(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t rank) 319 { 320 size_t regs = 1 << rank; 321 322 for (size_t i=0; i<regs; ++i) 323 { 324 size_t index = reverse_bits(FFT_TYPE(i), rank); 325 326 ARCH_X86_ASM 327 ( 328 /* Load scalar values */ 329 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 x x x */ 330 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 x x x */ 331 __ASM_EMIT("add %[regs], %[index]") 332 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 x x x */ 333 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 x x x */ 334 __ASM_EMIT("add %[regs], %[index]") 335 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 x x x */ 336 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 x x x */ 337 __ASM_EMIT("add %[regs], %[index]") 338 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 x x x */ 339 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 x x x */ 340 __ASM_EMIT("add %[regs], %[index]") 341 342 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 x r2 x */ 343 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 x i2 x */ 344 __ASM_EMIT("add %[regs], %[index]") 345 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 x r10 x */ 346 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 x i10 x */ 347 __ASM_EMIT("add %[regs], %[index]") 348 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 x r6 x */ 349 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 x i6 x */ 350 __ASM_EMIT("add %[regs], %[index]") 351 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 x r14 x */ 352 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 x i14 x */ 353 __ASM_EMIT("add %[regs], %[index]") 354 355 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 r1 r2 x */ 356 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 i1 i2 x */ 357 __ASM_EMIT("add %[regs], %[index]") 358 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 r9 r10 x */ 359 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 i9 i10 x */ 360 __ASM_EMIT("add %[regs], %[index]") 361 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 r5 r6 x */ 362 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 i5 i6 x */ 363 __ASM_EMIT("add %[regs], %[index]") 364 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 r13 r14 x */ 365 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 i13 i14 x */ 366 __ASM_EMIT("add %[regs], %[index]") 367 368 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 369 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 i1 i2 i3 */ 370 __ASM_EMIT("add %[regs], %[index]") 371 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 r9 r10 r11 */ 372 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 i9 i10 i11 */ 373 __ASM_EMIT("add %[regs], %[index]") 374 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 r5 r6 r7 */ 375 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 i5 i6 i7 */ 376 __ASM_EMIT("add %[regs], %[index]") 377 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 r13 r14 r15 */ 378 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 i13 i14 i15 */ 379 __ASM_EMIT("add %[regs], %[index]") 380 381 __ASM_EMIT("vinsertf128 $1, %%xmm1, %%ymm0, %%ymm0") /* ymm0 = r0 r1 r2 r3 ... */ 382 __ASM_EMIT("vinsertf128 $1, %%xmm3, %%ymm2, %%ymm2") /* ymm2 = i0 i1 i2 i3 ... */ 383 __ASM_EMIT("vinsertf128 $1, %%xmm5, %%ymm4, %%ymm4") /* ymm4 = r4 r5 r6 r7 ... */ 384 __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm6, %%ymm6") /* ymm0 = i4 i5 i6 i7 ... */ 385 /* 1st-order 4x butterfly */ 386 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ 387 __ASM_EMIT("vhsubps %%ymm6, %%ymm2, %%ymm3") /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ 388 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ 389 __ASM_EMIT("vhaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ 390 /* 2nd-order 4x butterfly */ 391 __ASM_EMIT("vblendps $0xaa, %%ymm3, %%ymm1, %%ymm4") /* ymm4 = r1' i3' r5' i7' */ 392 __ASM_EMIT("vblendps $0xaa, %%ymm1, %%ymm3, %%ymm5") /* ymm5 = i1' r3' i5' r7' */ 393 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7" */ 394 __ASM_EMIT("vhsubps %%ymm5, %%ymm2, %%ymm3") /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5" */ 395 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5" */ 396 __ASM_EMIT("vhaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7" */ 397 __ASM_EMIT("vblendps $0xcc, %%ymm3, %%ymm2, %%ymm4") /* ymm4 = i0" i4" i1" i5" */ 398 __ASM_EMIT("vblendps $0xcc, %%ymm2, %%ymm3, %%ymm5") /* ymm5 = i2" i6" i3" i7" */ 399 __ASM_EMIT("vshufps $0x88, %%ymm1, %%ymm0, %%ymm2") /* ymm2 = r0" r1" r2" r3" */ 400 __ASM_EMIT("vshufps $0xdd, %%ymm1, %%ymm0, %%ymm3") /* ymm3 = r4" r5" r6" r7" */ 401 __ASM_EMIT("vshufps $0x88, %%ymm5, %%ymm4, %%ymm6") /* ymm6 = i0" i1" i2" i3" */ 402 __ASM_EMIT("vshufps $0xdd, %%ymm5, %%ymm4, %%ymm7") /* ymm7 = i4" i5" i6" i7" */ 403 /* 3rd-order 8x butterfly */ 404 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm3, %%ymm4") /* ymm4 = x_im * b_re */ \ 405 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm7, %%ymm5") /* ymm5 = x_im * b_im */ \ 406 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm3, %%ymm3", "")) /* ymm3 = x_re * b_re */ \ 407 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm7, %%ymm7", "")) /* ymm7 = x_re * b_im */ \ 408 __ASM_EMIT(FFT_FMA("vaddps %%ymm5, %%ymm3, %%ymm5", "vfmadd231ps 0x00 + %[FFT_A], %%ymm3, %%ymm5")) /* ymm5 = c_re = x_re * b_re + x_im * b_im */ \ 409 __ASM_EMIT(FFT_FMA("vsubps %%ymm4, %%ymm7, %%ymm4", "vfmsub231ps 0x00 + %[FFT_A], %%ymm7, %%ymm4")) /* ymm4 = c_im = x_re * b_im - x_im * b_re */ \ 410 __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm0") /* ymm0 = a_re - c_re */ \ 411 __ASM_EMIT("vsubps %%ymm4, %%ymm6, %%ymm1") /* ymm1 = a_im - c_im */ \ 412 __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = a_re + c_re */ \ 413 __ASM_EMIT("vaddps %%ymm4, %%ymm6, %%ymm3") /* ymm3 = a_im + c_im */ \ 414 /* Store */ 415 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst_re])") 416 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst_re])") 417 __ASM_EMIT("vextractf128 $1, %%ymm2, 0x20(%[dst_re])") 418 __ASM_EMIT("vextractf128 $1, %%ymm0, 0x30(%[dst_re])") 419 __ASM_EMIT("vmovups %%xmm3, 0x00(%[dst_im])") 420 __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst_im])") 421 __ASM_EMIT("vextractf128 $1, %%ymm3, 0x20(%[dst_im])") 422 __ASM_EMIT("vextractf128 $1, %%ymm1, 0x30(%[dst_im])") 423 __ASM_EMIT("add $0x40, %[dst_re]") 424 __ASM_EMIT("add $0x40, %[dst_im]") 425 426 : [dst_re] "+r" (dst_re), [dst_im] "+r"(dst_im), [index] "+r"(index) 427 : [src_re] "r" (src_re), [src_im] "r"(src_im), [regs] __ASM_ARG_RO(regs), 428 [FFT_A] "o" (FFT_A) 429 : "cc", "memory", 430 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 431 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 432 ); 433 } 434 } 435 FFT_SCRAMBLE_COPY_REVERSE_NAME(float * dst_re,float * dst_im,const float * src_re,const float * src_im,size_t rank)436 static inline void FFT_SCRAMBLE_COPY_REVERSE_NAME(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t rank) 437 { 438 size_t regs = 1 << rank; 439 440 for (size_t i=0; i<regs; ++i) 441 { 442 size_t index = reverse_bits(FFT_TYPE(i), rank); 443 444 ARCH_X86_ASM 445 ( 446 /* Load scalar values */ 447 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 x x x */ 448 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 x x x */ 449 __ASM_EMIT("add %[regs], %[index]") 450 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 x x x */ 451 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 x x x */ 452 __ASM_EMIT("add %[regs], %[index]") 453 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 x x x */ 454 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 x x x */ 455 __ASM_EMIT("add %[regs], %[index]") 456 __ASM_EMIT("vinsertps $0x00, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 x x x */ 457 __ASM_EMIT("vinsertps $0x00, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 x x x */ 458 __ASM_EMIT("add %[regs], %[index]") 459 460 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 x r2 x */ 461 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 x i2 x */ 462 __ASM_EMIT("add %[regs], %[index]") 463 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 x r10 x */ 464 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 x i10 x */ 465 __ASM_EMIT("add %[regs], %[index]") 466 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 x r6 x */ 467 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 x i6 x */ 468 __ASM_EMIT("add %[regs], %[index]") 469 __ASM_EMIT("vinsertps $0x20, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 x r14 x */ 470 __ASM_EMIT("vinsertps $0x20, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 x i14 x */ 471 __ASM_EMIT("add %[regs], %[index]") 472 473 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 r1 r2 x */ 474 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 i1 i2 x */ 475 __ASM_EMIT("add %[regs], %[index]") 476 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 r9 r10 x */ 477 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 i9 i10 x */ 478 __ASM_EMIT("add %[regs], %[index]") 479 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 r5 r6 x */ 480 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 i5 i6 x */ 481 __ASM_EMIT("add %[regs], %[index]") 482 __ASM_EMIT("vinsertps $0x10, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 r13 r14 x */ 483 __ASM_EMIT("vinsertps $0x10, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 i13 i14 x */ 484 __ASM_EMIT("add %[regs], %[index]") 485 486 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm0, %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 487 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm2, %%xmm2") /* xmm2 = i0 i1 i2 i3 */ 488 __ASM_EMIT("add %[regs], %[index]") 489 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm1, %%xmm1") /* xmm1 = r8 r9 r10 r11 */ 490 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm3, %%xmm3") /* xmm3 = i8 i9 i10 i11 */ 491 __ASM_EMIT("add %[regs], %[index]") 492 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm4, %%xmm4") /* xmm4 = r4 r5 r6 r7 */ 493 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm6, %%xmm6") /* xmm6 = i4 i5 i6 i7 */ 494 __ASM_EMIT("add %[regs], %[index]") 495 __ASM_EMIT("vinsertps $0x30, (%[src_re], %[index], 4), %%xmm5, %%xmm5") /* xmm5 = r12 r13 r14 r15 */ 496 __ASM_EMIT("vinsertps $0x30, (%[src_im], %[index], 4), %%xmm7, %%xmm7") /* xmm7 = i12 i13 i14 i15 */ 497 __ASM_EMIT("add %[regs], %[index]") 498 499 __ASM_EMIT("vinsertf128 $1, %%xmm1, %%ymm0, %%ymm0") /* ymm0 = r0 r1 r2 r3 ... */ 500 __ASM_EMIT("vinsertf128 $1, %%xmm3, %%ymm2, %%ymm2") /* ymm2 = i0 i1 i2 i3 ... */ 501 __ASM_EMIT("vinsertf128 $1, %%xmm5, %%ymm4, %%ymm4") /* ymm4 = r4 r5 r6 r7 ... */ 502 __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm6, %%ymm6") /* ymm0 = i4 i5 i6 i7 ... */ 503 /* 1st-order 4x butterfly */ 504 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ 505 __ASM_EMIT("vhsubps %%ymm6, %%ymm2, %%ymm3") /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ 506 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ 507 __ASM_EMIT("vhaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ 508 /* 2nd-order 4x butterfly */ 509 __ASM_EMIT("vblendps $0xaa, %%ymm3, %%ymm1, %%ymm4") /* ymm4 = r1' i3' r5' i7' */ 510 __ASM_EMIT("vblendps $0xaa, %%ymm1, %%ymm3, %%ymm5") /* ymm5 = i1' r3' i5' r7' */ 511 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ 512 __ASM_EMIT("vhsubps %%ymm5, %%ymm2, %%ymm3") /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ 513 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ 514 __ASM_EMIT("vhaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ 515 __ASM_EMIT("vblendps $0xcc, %%ymm1, %%ymm0, %%ymm4") /* ymm4 = r0" i4" r1" r5" */ 516 __ASM_EMIT("vblendps $0xcc, %%ymm0, %%ymm1, %%ymm5") /* ymm5 = r2" r6" r3" r7" */ 517 __ASM_EMIT("vshufps $0x88, %%ymm3, %%ymm2, %%ymm6") /* ymm6 = i0" i1" i2" i3" */ 518 __ASM_EMIT("vshufps $0xdd, %%ymm3, %%ymm2, %%ymm7") /* ymm7 = i4" i5" i6" i7" */ 519 __ASM_EMIT("vshufps $0x88, %%ymm5, %%ymm4, %%ymm2") /* ymm2 = r0" r1" r2" r3" */ 520 __ASM_EMIT("vshufps $0xdd, %%ymm5, %%ymm4, %%ymm3") /* ymm3 = r4" r5" r6" r7" */ 521 /* 3rd-order 8x butterfly */ 522 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm3, %%ymm4") /* ymm4 = x_im * b_re */ \ 523 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm7, %%ymm5") /* ymm5 = x_im * b_im */ \ 524 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm3, %%ymm3", "")) /* ymm3 = x_re * b_re */ \ 525 __ASM_EMIT(FFT_FMA("vmulps 0x00 + %[FFT_A], %%ymm7, %%ymm7", "")) /* ymm7 = x_re * b_im */ \ 526 __ASM_EMIT(FFT_FMA("vsubps %%ymm5, %%ymm3, %%ymm5", "vfmsub231ps 0x00 + %[FFT_A], %%ymm3, %%ymm5")) /* ymm5 = c_re = x_re * b_re - x_im * b_im */ \ 527 __ASM_EMIT(FFT_FMA("vaddps %%ymm4, %%ymm7, %%ymm4", "vfmadd231ps 0x00 + %[FFT_A], %%ymm7, %%ymm4")) /* ymm4 = c_im = x_re * b_im + x_im * b_re */ \ 528 __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm0") /* ymm0 = a_re - c_re */ \ 529 __ASM_EMIT("vsubps %%ymm4, %%ymm6, %%ymm1") /* ymm1 = a_im - c_im */ \ 530 __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = a_re + c_re */ \ 531 __ASM_EMIT("vaddps %%ymm4, %%ymm6, %%ymm3") /* ymm3 = a_im + c_im */ \ 532 /* Store */ 533 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst_re])") 534 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst_re])") 535 __ASM_EMIT("vextractf128 $1, %%ymm2, 0x20(%[dst_re])") 536 __ASM_EMIT("vextractf128 $1, %%ymm0, 0x30(%[dst_re])") 537 __ASM_EMIT("vmovups %%xmm3, 0x00(%[dst_im])") 538 __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst_im])") 539 __ASM_EMIT("vextractf128 $1, %%ymm3, 0x20(%[dst_im])") 540 __ASM_EMIT("vextractf128 $1, %%ymm1, 0x30(%[dst_im])") 541 __ASM_EMIT("add $0x40, %[dst_re]") 542 __ASM_EMIT("add $0x40, %[dst_im]") 543 544 : [dst_re] "+r" (dst_re), [dst_im] "+r"(dst_im), [index] "+r"(index) 545 : [src_re] "r"(src_re), [src_im] "r"(src_im), [regs] __ASM_ARG_RO(regs), 546 [FFT_A] "o" (FFT_A) 547 : "cc", "memory", 548 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 549 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 550 ); 551 } 552 } 553 } 554 555 #undef FFT_SCRAMBLE_SELF_DIRECT_NAME 556 #undef FFT_SCRAMBLE_SELF_REVERSE_NAME 557 #undef FFT_SCRAMBLE_COPY_DIRECT_NAME 558 #undef FFT_SCRAMBLE_COPY_REVERSE_NAME 559 #undef FFT_TYPE 560 #undef FFT_FMA 561 562