/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 13 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX_IMPL */

#define __SKIP(x)

// c  = a - b
// a' = a + b
// b' = c * w
#define FASTCONV_DIRECT_BUTTERFLY_BODY8(add_re, add_im, FMA_SEL) \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT("vmovaps 0x00(%[ak]), %%ymm6") /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps 0x20(%[ak]), %%ymm7") /* ymm7 = x_im */ \
        /* Start loop */ \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[dst], %[off1]), %%ymm0") /* ymm0 = a_re */ \
        __ASM_EMIT("vmovups 0x20(%[dst], %[off1]), %%ymm1") /* ymm1 = a_im */ \
        __ASM_EMIT("vmovups 0x00(%[dst], %[off2]), %%ymm2") /* ymm2 = b_re */ \
        __ASM_EMIT("vmovups 0x20(%[dst], %[off2]), %%ymm3") /* ymm3 = b_im */ \
        /* Perform butterfly */ \
        __ASM_EMIT("vsubps %%ymm2, %%ymm0, %%ymm4") /* ymm4 = c_re = a_re - b_re */ \
        __ASM_EMIT("vsubps %%ymm3, %%ymm1, %%ymm5") /* ymm5 = c_im = a_im - b_im */ \
        __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0") /* ymm0 = a_re' = a_re + b_re */ \
        __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = a_im' = a_im + b_im */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm4, %%ymm2") /* ymm2 = x_im * c_re */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm5, %%ymm3") /* ymm3 = x_im * c_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm4, %%ymm4", "")) /* ymm4 = x_re * c_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm5, %%ymm5", "")) /* ymm5 = x_re * c_im */ \
        __ASM_EMIT(FMA_SEL(add_re " %%ymm3, %%ymm4, %%ymm4", add_re " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = b_re = x_re * c_re +- x_im * c_im */ \
        __ASM_EMIT(FMA_SEL(add_im " %%ymm2, %%ymm5, %%ymm5", add_im " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = b_im = x_re * c_im -+ x_im * c_re */ \
        /* Store values */ \
        __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst], %[off1])") \
        __ASM_EMIT("vmovups %%ymm1, 0x20(%[dst], %[off1])") \
        __ASM_EMIT("vmovups %%ymm4, 0x00(%[dst], %[off2])") \
        __ASM_EMIT("vmovups %%ymm5, 0x20(%[dst], %[off2])") \
        __ASM_EMIT("add $0x40, %[off1]") \
        __ASM_EMIT("add $0x40, %[off2]") \
        __ASM_EMIT32("subl $8, %[np]") \
        __ASM_EMIT64("subq $8, %[np]") \
        __ASM_EMIT("jz 2f") \
        /* Rotate angle */ \
        __ASM_EMIT("vmovaps 0x00(%[wk]), %%ymm4") /* ymm4 = w_re */ \
        __ASM_EMIT("vmovaps 0x20(%[wk]), %%ymm5") /* ymm5 = w_im */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm6, %%ymm2") /* ymm2 = w_im * x_re */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm7, %%ymm3") /* ymm3 = w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm6, %%ymm6", "")) /* ymm6 = w_re * x_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm7, %%ymm7", "")) /* ymm7 = w_re * x_im */ \
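        /* Non-FMA path pre-multiplies by w_re above; the FMA path folds that multiply into vfmsub132ps/vfmadd132ps below */ \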
        __ASM_EMIT(FMA_SEL("vsubps %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vaddps %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
        /* Repeat loop */ \
        __ASM_EMIT("jmp 1b") \
        __ASM_EMIT("2:") \
        \
        : [off1] "+r" (off1), [off2] "+r" (off2), [np] __ASM_ARG_RW(np) \
        : [dst] "r" (dst), [ak] "r" (ak), [wk] "r" (wk) \
        : "cc", "memory", \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );


#define FASTCONV_DIRECT_BUTTERFLY_LAST(add_re, add_im, FMA_SEL) \
    ARCH_X86_ASM( \
        /* Loop 2x 4-element butterflies */ \
        __ASM_EMIT("vmovaps 0x00 + %[FFT_A], %%ymm6") /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps 0x20 + %[FFT_A], %%ymm7") /* ymm7 = x_im */ \
        __ASM_EMIT("sub $2, %[nb]") \
        __ASM_EMIT("jb 2f") \
        __ASM_EMIT("1:") \
        /* Load data to registers */ \
        __ASM_EMIT("vmovups 0x00(%[dst]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ \
        __ASM_EMIT("vmovups 0x10(%[dst]), %%xmm2") /* xmm2 = r4 r5 r6 r7 */ \
        __ASM_EMIT("vmovups 0x20(%[dst]), %%xmm1") /* xmm1 = i0 i1 i2 i3 */ \
        __ASM_EMIT("vmovups 0x30(%[dst]), %%xmm3") /* xmm3 = i4 i5 i6 i7 */ \
        __ASM_EMIT("vinsertf128 $1, 0x40(%[dst]), %%ymm0, %%ymm0") /* ymm0 = a_re = r0 r1 r2 r3 r8 r9 r10 r11 */ \
        __ASM_EMIT("vinsertf128 $1, 0x50(%[dst]), %%ymm2, %%ymm2") /* ymm2 = b_re = r4 r5 r6 r7 r12 r13 r14 r15 */ \
        __ASM_EMIT("vinsertf128 $1, 0x60(%[dst]), %%ymm1, %%ymm1") /* ymm1 = a_im = i0 i1 i2 i3 i8 i9 i10 i11 */ \
        __ASM_EMIT("vinsertf128 $1, 0x70(%[dst]), %%ymm3, %%ymm3") /* ymm3 = b_im = i4 i5 i6 i7 i12 i13 i14 i15 */ \
        /* Perform 3rd-order butterflies */ \
        __ASM_EMIT("vsubps %%ymm2, %%ymm0, %%ymm4") /* ymm4 = c_re = a_re - b_re */ \
        __ASM_EMIT("vsubps %%ymm3, %%ymm1, %%ymm5") /* ymm5 = c_im = a_im - b_im */ \
        __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0") /* ymm0 = a_re' = a_re + b_re */ \
        __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = a_im' = a_im + b_im */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm4, %%ymm2") /* ymm2 = x_im * c_re */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm5, %%ymm3") /* ymm3 = x_im * c_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm4, %%ymm4", "")) /* ymm4 = x_re * c_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm5, %%ymm5", "")) /* ymm5 = x_re * c_im */ \
        __ASM_EMIT(FMA_SEL(add_re " %%ymm3, %%ymm4, %%ymm4", add_re " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = b_re = x_re * c_re +- x_im * c_im */ \
        __ASM_EMIT(FMA_SEL(add_im " %%ymm2, %%ymm5, %%ymm5", add_im " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = b_im = x_re * c_im -+ x_im * c_re */ \
        /* 2nd-order butterflies */ \
        /* s0" = (r0 + r2) + j*(i0 + i2) + (r1 + r3) + j*(i1 + i3) */ \
        /* s1" = (r0 + r2) + j*(i0 + i2) - (r1 + r3) - j*(i1 + i3) */ \
        /* s2" = (r0 - r2) + j*(i0 - i2) + (i1 - i3) - j*(r1 - r3) */ \
        /* s3" = (r0 - r2) + j*(i0 - i2) - (i1 - i3) + j*(r1 - r3) */ \
        /* ymm0 = r0 r1 r2 r3 ... */ \
        /* ymm1 = i0 i1 i2 i3 ... */ \
        /* ymm4 = r4 r5 r6 r7 ... */ \
        /* ymm5 = i4 i5 i6 i7 ... */ \
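        /* Reorder each quad to (even, odd) pairs so vhaddps/vhsubps produce the pairwise sums and differences of the 2nd-order butterflies in one pass */ \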
        __ASM_EMIT("vshufps $0xd8, %%ymm0, %%ymm0, %%ymm0") /* ymm0 = r0 r2 r1 r3 */ \
        __ASM_EMIT("vshufps $0xd8, %%ymm1, %%ymm1, %%ymm1") /* ymm1 = i0 i2 i1 i3 */ \
        __ASM_EMIT("vshufps $0xd8, %%ymm4, %%ymm4, %%ymm4") \
        __ASM_EMIT("vshufps $0xd8, %%ymm5, %%ymm5, %%ymm5") \
        __ASM_EMIT("vhsubps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = r0-r2 r1-r3 i0-i2 i1-i3 = r1' r3' i1' i3' */ \
        __ASM_EMIT("vhsubps %%ymm5, %%ymm4, %%ymm3") \
        __ASM_EMIT("vhaddps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = r0+r2 r1+r3 i0+i2 i1+i3 = r0' r2' i0' i2' */ \
        __ASM_EMIT("vhaddps %%ymm5, %%ymm4, %%ymm4") \
        /* 1st-order 8x butterfly */ \
        __ASM_EMIT("vshufps $0x6e, %%ymm2, %%ymm0, %%ymm1") /* ymm1 = i0' i2' i1' r3' */ \
        __ASM_EMIT("vshufps $0x6e, %%ymm3, %%ymm4, %%ymm5") \
        __ASM_EMIT("vshufps $0xc4, %%ymm2, %%ymm0, %%ymm0") /* ymm0 = r0' r2' r1' i3' */ \
        __ASM_EMIT("vshufps $0xc4, %%ymm3, %%ymm4, %%ymm4") \
        __ASM_EMIT("vhsubps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = r0'-r2' r1'-i3' i0'-i2' i1'-r3' = r1" r3" i1" i2" */ \
        __ASM_EMIT("vhsubps %%ymm5, %%ymm4, %%ymm3") \
        __ASM_EMIT("vhaddps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = r0'+r2' r1'+i3' i0'+i2' i1'+r3' = r0" r2" i0" i3" */ \
        __ASM_EMIT("vhaddps %%ymm5, %%ymm4, %%ymm4") \
        __ASM_EMIT("vblendps $0x88, %%ymm0, %%ymm2, %%ymm1") /* ymm1 = r1" r3" i1" i3" */ \
        __ASM_EMIT("vblendps $0x88, %%ymm4, %%ymm3, %%ymm5") \
        __ASM_EMIT("vblendps $0x88, %%ymm2, %%ymm0, %%ymm0") /* ymm0 = r0" r2" i0" i2" */ \
        __ASM_EMIT("vblendps $0x88, %%ymm3, %%ymm4, %%ymm4") \
        __ASM_EMIT("vunpckhps %%ymm1, %%ymm0, %%ymm3") /* ymm3 = i0" i1" i2" i3" */ \
        __ASM_EMIT("vunpcklps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = r0" r1" r2" r3" */ \
        __ASM_EMIT("vunpckhps %%ymm5, %%ymm4, %%ymm1") \
        __ASM_EMIT("vunpcklps %%ymm5, %%ymm4, %%ymm0") \
        /* Store */ \
        __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst])") \
        __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst])") \
        __ASM_EMIT("vmovups %%xmm3, 0x20(%[dst])") \
        __ASM_EMIT("vmovups %%xmm1, 0x30(%[dst])") \
        __ASM_EMIT("vextractf128 $1, %%ymm2, 0x40(%[dst])") \
        __ASM_EMIT("vextractf128 $1, %%ymm0, 0x50(%[dst])") \
        __ASM_EMIT("vextractf128 $1, %%ymm3, 0x60(%[dst])") \
        __ASM_EMIT("vextractf128 $1, %%ymm1, 0x70(%[dst])") \
        /* Move pointers and repeat */ \
        __ASM_EMIT("add $0x80, %[dst]") \
        __ASM_EMIT("sub $2, %[nb]") \
        __ASM_EMIT("jae 1b") \
        __ASM_EMIT("2:") \
        /* 1x 4-element butterflies */ \
        __ASM_EMIT("add $1, %[nb]") \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT("vmovups 0x00(%[dst]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ \
        __ASM_EMIT("vmovups 0x10(%[dst]), %%xmm2") /* xmm2 = r4 r5 r6 r7 */ \
        __ASM_EMIT("vmovups 0x20(%[dst]), %%xmm1") /* xmm1 = i0 i1 i2 i3 */ \
        __ASM_EMIT("vmovups 0x30(%[dst]), %%xmm3") /* xmm3 = i4 i5 i6 i7 */ \
        /* Perform 3rd-order 8x butterfly */ \
        __ASM_EMIT("vsubps %%xmm2, %%xmm0, %%xmm4") /* xmm4 = c_re = a_re - b_re */ \
        __ASM_EMIT("vsubps %%xmm3, %%xmm1, %%xmm5") /* xmm5 = c_im = a_im - b_im */ \
        __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = a_re' = a_re + b_re */ \
        __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = a_im' = a_im + b_im */ \
        __ASM_EMIT("vmulps %%xmm7, %%xmm4, %%xmm2") /* xmm2 = x_im * c_re */ \
        __ASM_EMIT("vmulps %%xmm7, %%xmm5, %%xmm3") /* xmm3 = x_im * c_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm6, %%xmm4, %%xmm4", "")) /* xmm4 = x_re * c_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm6, %%xmm5, %%xmm5", "")) /* xmm5 = x_re * c_im */ \
        __ASM_EMIT(FMA_SEL(add_re " %%xmm3, %%xmm4, %%xmm4", add_re " %%xmm6, %%xmm3, %%xmm4")) /* xmm4 = b_re = x_re * c_re +- x_im * c_im */ \
        __ASM_EMIT(FMA_SEL(add_im " %%xmm2, %%xmm5, %%xmm5", add_im " %%xmm6, %%xmm2, %%xmm5")) /* xmm5 = b_im = x_re * c_im -+ x_im * c_re */ \
        /* 2nd-order butterflies */ \
        /* s0" = (r0 + r2) + j*(i0 + i2) + (r1 + r3) + j*(i1 + i3) */ \
        /* s1" = (r0 + r2) + j*(i0 + i2) - (r1 + r3) - j*(i1 + i3) */ \
        /* s2" = (r0 - r2) + j*(i0 - i2) + (i1 - i3) - j*(r1 - r3) */ \
        /* s3" = (r0 - r2) + j*(i0 - i2) - (i1 - i3) + j*(r1 - r3) */ \
        /* xmm0 = r0 r1 r2 r3 ... */ \
        /* xmm1 = i0 i1 i2 i3 ... */ \
        /* xmm4 = r4 r5 r6 r7 ... */ \
        /* xmm5 = i4 i5 i6 i7 ... */ \
        __ASM_EMIT("vshufps $0xd8, %%xmm0, %%xmm0, %%xmm0") /* xmm0 = r0 r2 r1 r3 */ \
        __ASM_EMIT("vshufps $0xd8, %%xmm1, %%xmm1, %%xmm1") /* xmm1 = i0 i2 i1 i3 */ \
        __ASM_EMIT("vshufps $0xd8, %%xmm4, %%xmm4, %%xmm4") \
        __ASM_EMIT("vshufps $0xd8, %%xmm5, %%xmm5, %%xmm5") \
        __ASM_EMIT("vhsubps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = r0-r2 r1-r3 i0-i2 i1-i3 = r1' r3' i1' i3' */ \
        __ASM_EMIT("vhsubps %%xmm5, %%xmm4, %%xmm3") \
        __ASM_EMIT("vhaddps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = r0+r2 r1+r3 i0+i2 i1+i3 = r0' r2' i0' i2' */ \
        __ASM_EMIT("vhaddps %%xmm5, %%xmm4, %%xmm4") \
        /* 1st-order butterflies */ \
        __ASM_EMIT("vshufps $0x6e, %%xmm2, %%xmm0, %%xmm1") /* xmm1 = i0' i2' i1' r3' */ \
        __ASM_EMIT("vshufps $0x6e, %%xmm3, %%xmm4, %%xmm5") \
        __ASM_EMIT("vshufps $0xc4, %%xmm2, %%xmm0, %%xmm0") /* xmm0 = r0' r2' r1' i3' */ \
        __ASM_EMIT("vshufps $0xc4, %%xmm3, %%xmm4, %%xmm4") \
        __ASM_EMIT("vmovups %%xmm0, 0x00(%[dst])") \
        __ASM_EMIT("vmovups %%xmm1, 0x10(%[dst])") \
        __ASM_EMIT("vmovups %%xmm4, 0x20(%[dst])") \
        __ASM_EMIT("vmovups %%xmm5, 0x30(%[dst])") \
        __ASM_EMIT("vhsubps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = r0'-r2' r1'-i3' i0'-i2' i1'-r3' = r1" r3" i1" i2" */ \
        __ASM_EMIT("vhsubps %%xmm5, %%xmm4, %%xmm3") \
        __ASM_EMIT("vhaddps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = r0'+r2' r1'+i3' i0'+i2' i1'+r3' = r0" r2" i0" i3" */ \
        __ASM_EMIT("vhaddps %%xmm5, %%xmm4, %%xmm4") \
        __ASM_EMIT("vblendps $0x88, %%xmm0, %%xmm2, %%xmm1") /* xmm1 = r1" r3" i1" i3" */ \
        __ASM_EMIT("vblendps $0x88, %%xmm4, %%xmm3, %%xmm5") \
        __ASM_EMIT("vblendps $0x88, %%xmm2, %%xmm0, %%xmm0") /* xmm0 = r0" r2" i0" i2" */ \
        __ASM_EMIT("vblendps $0x88, %%xmm3, %%xmm4, %%xmm4") \
        __ASM_EMIT("vunpckhps %%xmm1, %%xmm0, %%xmm3") /* xmm3 = i0" i1" i2" i3" */ \
        __ASM_EMIT("vunpcklps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = r0" r1" r2" r3" */ \
        __ASM_EMIT("vunpckhps %%xmm5, %%xmm4, %%xmm1") \
        __ASM_EMIT("vunpcklps %%xmm5, %%xmm4, %%xmm0") \
        /* Store */ \
        __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst])") \
        __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst])") \
        __ASM_EMIT("vmovups %%xmm3, 0x20(%[dst])") \
        __ASM_EMIT("vmovups %%xmm1, 0x30(%[dst])") \
        __ASM_EMIT("4:") \
        \
        : [dst] "+r" (dst), [nb] "+r" (nb) \
        : [FFT_A] "o" (FFT_A) \
        : "cc", "memory", \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );

#define FASTCONV_REVERSE_BUTTERFLY_BODY8(add_re, add_im, FMA_SEL) \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT("vmovaps 0x00(%[ak]), %%ymm6") /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps 0x20(%[ak]), %%ymm7") /* ymm7 = x_im */ \
        /* Start loop */ \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[dst], %[off1]), %%ymm0") /* ymm0 = a_re */ \
        __ASM_EMIT("vmovups 0x20(%[dst], %[off1]), %%ymm1") /* ymm1 = a_im */ \
        __ASM_EMIT("vmovups 0x00(%[dst], %[off2]), %%ymm2") /* ymm2 = b_re */ \
        __ASM_EMIT("vmovups 0x20(%[dst], %[off2]), %%ymm3") /* ymm3 = b_im */ \
        /* Calculate complex multiplication */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm2, %%ymm4") /* ymm4 = x_im * b_re */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm3, %%ymm5") /* ymm5 = x_im * b_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm2, %%ymm2", "")) /* ymm2 = x_re * b_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm3, %%ymm3", "")) /* ymm3 = x_re * b_im */ \
        __ASM_EMIT(FMA_SEL(add_re " %%ymm5, %%ymm2, %%ymm5", add_re " %%ymm6, %%ymm2, %%ymm5")) /* ymm5 = c_re = x_re * b_re +- x_im * b_im */ \
        __ASM_EMIT(FMA_SEL(add_im " %%ymm4, %%ymm3, %%ymm4", add_im " %%ymm6, %%ymm3, %%ymm4")) /* ymm4 = c_im = x_re * b_im -+ x_im * b_re */ \
        /* Perform butterfly */ \
        __ASM_EMIT("vsubps %%ymm5, %%ymm0, %%ymm2") /* ymm2 = a_re - c_re */ \
        __ASM_EMIT("vsubps %%ymm4, %%ymm1, %%ymm3") /* ymm3 = a_im - c_im */ \
        __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = a_re + c_re */ \
        __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = a_im + c_im */ \
        /* Store values */ \
        __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst], %[off1])") \
        __ASM_EMIT("vmovups %%ymm1, 0x20(%[dst], %[off1])") \
        __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst], %[off2])") \
        __ASM_EMIT("vmovups %%ymm3, 0x20(%[dst], %[off2])") \
        __ASM_EMIT("add $0x40, %[off1]") \
        __ASM_EMIT("add $0x40, %[off2]") \
        __ASM_EMIT32("subl $8, %[np]") \
        __ASM_EMIT64("subq $8, %[np]") \
        __ASM_EMIT("jz 2f") \
        /* Rotate angle */ \
        __ASM_EMIT("vmovaps 0x00(%[wk]), %%ymm4") /* ymm4 = w_re */ \
        __ASM_EMIT("vmovaps 0x20(%[wk]), %%ymm5") /* ymm5 = w_im */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm6, %%ymm2") /* ymm2 = w_im * x_re */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm7, %%ymm3") /* ymm3 = w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm6, %%ymm6", "")) /* ymm6 = w_re * x_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm7, %%ymm7", "")) /* ymm7 = w_re * x_im */ \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vaddps %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
        /* Repeat loop */ \
        __ASM_EMIT("jmp 1b") \
        __ASM_EMIT("2:") \
        \
        : [off1] "+r" (off1), [off2] "+r" (off2), [np] __ASM_ARG_RW(np) \
        : [dst] "r" (dst), [ak] "r" (ak), [wk] "r" (wk) \
        : "cc", "memory", \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );

#define FASTCONV_REVERSE_BUTTERFLY_BODY_LAST(add_re, add_im, FMA_SEL, IF_ADD) \
    size_t off; \
    float norm = 0.5f / np; \
    ARCH_X86_ASM \
    ( \
        /* Prepare angle */ \
        __ASM_EMIT("vbroadcastss %[norm], %%ymm1") /* ymm1 = k */ \
        __ASM_EMIT("lea (,%[np], 4), %[off]") /* off = np * 4 */ \
        __ASM_EMIT("vmovaps 0x00(%[ak]), %%ymm6") /* ymm6 = x_re */ \
        __ASM_EMIT("vmovaps 0x20(%[ak]), %%ymm7") /* ymm7 = x_im */ \
        __ASM_EMIT("vmovaps 0x00(%[wk]), %%ymm4") /* ymm4 = w_re */ \
        __ASM_EMIT("vmovaps 0x20(%[wk]), %%ymm5") /* ymm5 = w_im */ \
        /* Start loop */ \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") /* ymm0 = a_re */ \
        __ASM_EMIT("vmovups 0x00(%[src], %[off], 2), %%ymm2") /* ymm2 = b_re */ \
        __ASM_EMIT("vmovups 0x20(%[src], %[off], 2), %%ymm3") /* ymm3 = b_im */ \
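        /* Last reverse pass: only the real output is needed, so just c_re is computed; results are scaled by norm = 0.5 / np, and IF_ADD optionally accumulates into dst */ \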
        /* Calculate complex multiplication */ \
        __ASM_EMIT("vmulps %%ymm7, %%ymm3, %%ymm3") /* ymm3 = x_im * b_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm6, %%ymm2, %%ymm2", "")) /* ymm2 = x_re * b_re */ \
        __ASM_EMIT(FMA_SEL(add_re " %%ymm3, %%ymm2, %%ymm3", add_re " %%ymm6, %%ymm2, %%ymm3")) /* ymm3 = c_re = x_re * b_re +- x_im * b_im */ \
        /* Perform butterfly */ \
        __ASM_EMIT("vsubps %%ymm3, %%ymm0, %%ymm2") /* ymm2 = a_re - c_re */ \
        __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = a_re + c_re */ \
        __ASM_EMIT("vmulps %%ymm1, %%ymm2, %%ymm2") \
        __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm0") \
        /* Store values */ \
        __ASM_EMIT(IF_ADD("vaddps 0x00(%[dst]), %%ymm0, %%ymm0")) \
        __ASM_EMIT(IF_ADD("vaddps 0x00(%[dst], %[off]), %%ymm2, %%ymm2")) \
        __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") \
        __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst], %[off])") \
        __ASM_EMIT("add $0x40, %[src]") \
        __ASM_EMIT("add $0x20, %[dst]") \
        __ASM_EMIT32("subl $8, %[np]") \
        __ASM_EMIT64("subq $8, %[np]") \
        __ASM_EMIT("jbe 2f") \
        /* Rotate angle */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm6, %%ymm2") /* ymm2 = w_im * x_re */ \
        __ASM_EMIT("vmulps %%ymm5, %%ymm7, %%ymm3") /* ymm3 = w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm6, %%ymm6", "")) /* ymm6 = w_re * x_re */ \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm7, %%ymm7", "")) /* ymm7 = w_re * x_im */ \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
        __ASM_EMIT(FMA_SEL("vaddps %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
        /* Repeat loop */ \
        __ASM_EMIT("jmp 1b") \
        __ASM_EMIT("2:") \
        \
        : [off] "=&r" (off), [np] __ASM_ARG_RW(np) \
        : [dst] "r" (dst), [src] "r" (src), [ak] "r" (ak), [wk] "r" (wk), \
          [norm] "o" (norm) \
        : "cc", "memory", \
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
          "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
    );

namespace avx
{
    #define FMA_OFF(a, b) a
    #define FMA_ON(a, b) b
    #define FASTCONV_SET(x)
    #define FASTCONV_ADD(x) x

    static inline void fastconv_direct_butterfly(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1 = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2 = off1 + step;
            np = pairs;

            FASTCONV_DIRECT_BUTTERFLY_BODY8("vaddps", "vsubps", FMA_OFF);

            off1 = off2;
        }
    }

    static inline void fastconv_direct_butterfly_last(float *dst, size_t nb)
    {
        FASTCONV_DIRECT_BUTTERFLY_LAST("vaddps", "vsubps", FMA_OFF);
    }

    static inline void fastconv_direct_butterfly_fma3(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1 = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2 = off1 + step;
            np = pairs;

            FASTCONV_DIRECT_BUTTERFLY_BODY8("vfmadd132ps", "vfmsub132ps", FMA_ON);

            off1 = off2;
        }
    }

    static inline void fastconv_direct_butterfly_last_fma3(float *dst, size_t nb)
    {
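        /* Same butterfly as fastconv_direct_butterfly_last(), with FMA_ON selecting the fused multiply-add forms */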
        FASTCONV_DIRECT_BUTTERFLY_LAST("vfmadd132ps", "vfmsub132ps", FMA_ON);
    }

    static inline void fastconv_reverse_butterfly(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1 = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2 = off1 + step;
            np = pairs;

            FASTCONV_REVERSE_BUTTERFLY_BODY8("vsubps", "vaddps", FMA_OFF);

            off1 = off2;
        }
    }

    static inline void fastconv_reverse_butterfly_last(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vsubps", "vaddps", FMA_OFF, FASTCONV_SET);
    }

    static inline void fastconv_reverse_butterfly_last_adding(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vsubps", "vaddps", FMA_OFF, FASTCONV_ADD);
    }

    static inline void fastconv_reverse_butterfly_fma3(float *dst, const float *ak, const float *wk, size_t pairs, size_t nb)
    {
        size_t off1, off2, np;
        off1 = 0;
        size_t step = pairs << 3;
        for (size_t i=0; i<nb; ++i)
        {
            off2 = off1 + step;
            np = pairs;

            FASTCONV_REVERSE_BUTTERFLY_BODY8("vfmsub231ps", "vfmadd231ps", FMA_ON);

            off1 = off2;
        }
    }

    static inline void fastconv_reverse_butterfly_last_fma3(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vfmsub231ps", "vfmadd231ps", FMA_ON, FASTCONV_SET);
    }

    static inline void fastconv_reverse_butterfly_last_adding_fma3(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_REVERSE_BUTTERFLY_BODY_LAST("vfmsub231ps", "vfmadd231ps", FMA_ON, FASTCONV_ADD);
    }

    #undef FASTCONV_DIRECT_BUTTERFLY_BODY8
    #undef FASTCONV_DIRECT_BUTTERFLY_LAST
    #undef FASTCONV_REVERSE_BUTTERFLY_BODY8
    #undef FASTCONV_REVERSE_BUTTERFLY_BODY_LAST
    #undef FASTCONV_SET
    #undef FASTCONV_ADD
    #undef FMA_OFF
    #undef FMA_ON
}
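
/*
 * Reference sketch (documentation only, not part of the dispatch): the scalar
 * form of the butterfly computed by FASTCONV_DIRECT_BUTTERFLY_BODY8 with the
 * non-FMA operand set ("vaddps", "vsubps"):
 *
 *   c_re   = a_re - b_re;               c_im   = a_im - b_im;
 *   a_re'  = a_re + b_re;               a_im'  = a_im + b_im;
 *   b_re'  = x_re * c_re + x_im * c_im;
 *   b_im'  = x_re * c_im - x_im * c_re;
 *
 * where (x_re, x_im) is the current twiddle factor taken from [ak] and rotated
 * once per iteration by the factor stored at [wk]:
 *
 *   x_re'  = w_re * x_re - w_im * x_im;
 *   x_im'  = w_re * x_im + w_im * x_re;
 */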