1 /* 2 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/> 3 * (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com> 4 * 5 * This file is part of lsp-plugins 6 * Created on: 6 мар. 2017 г. 7 * 8 * lsp-plugins is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Lesser General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * any later version. 12 * 13 * lsp-plugins is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public License 19 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>. 20 */ 21 #ifndef DSP_ARCH_X86_SSE_FASTCONV_H_IMPL 22 #error "This header should not be included directly" 23 #endif /* DSP_ARCH_X86_SSE_FASTCONV_H_IMPL */ 24 25 namespace sse 26 { fastconv_restore_internal(float * dst,float * tmp,size_t rank)27 static inline void fastconv_restore_internal(float *dst, float *tmp, size_t rank) 28 { 29 // Prepare for butterflies 30 size_t last = size_t(1) << rank; 31 size_t items = last << 1; 32 size_t n = 8; 33 size_t bs = n << 1; 34 35 const float *wk = XFFT_W; 36 const float *ak = XFFT_A; 37 38 // Iterate butterflies 39 while (n < last) 40 { 41 for (size_t p=0; p<items; p += bs) 42 { 43 // Set initial values of pointers 44 float *a = &tmp[p]; 45 float *b = &a[n]; 46 size_t k = n; 47 48 ARCH_X86_ASM 49 ( 50 __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6") /* xmm6 = rak[i] */ 51 __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7") /* xmm7 = iak[i] */ 52 : 53 : [ak] "r"(ak) 54 : "%xmm6", "%xmm7" 55 ); 56 57 ARCH_X86_ASM 58 ( 59 // __ASM_EMIT(".align 16") 60 __ASM_EMIT("1:") 61 62 __ASM_EMIT("movups 0x00(%[a]), %%xmm0") /* xmm0 = ra[i] */ 63 __ASM_EMIT("movups 0x10(%[a]), %%xmm1") /* xmm1 = ia[i] */ 64 __ASM_EMIT("movups 0x00(%[b]), %%xmm2") /* xmm2 = rb[i] */ 65 __ASM_EMIT("movups 0x10(%[b]), %%xmm3") /* xmm3 = ib[i] */ 66 67 __ASM_EMIT("movaps %%xmm2, %%xmm4") /* xmm4 = rb[i] */ 68 __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = ib[i] */ 69 __ASM_EMIT("mulps %%xmm6, %%xmm2") /* xmm2 = rak[i]*rb[i] */ 70 __ASM_EMIT("mulps %%xmm7, %%xmm4") /* xmm4 = iak[i]*rb[i] */ 71 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = rak[i]*ib[i] */ 72 __ASM_EMIT("mulps %%xmm7, %%xmm5") /* xmm5 = iak[i]*ib[i] */ 73 __ASM_EMIT("addps %%xmm4, %%xmm3") /* xmm3 = rak[i]*ib[i] + iak[i]*rb[i] = ic[i] */ 74 __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */ 75 __ASM_EMIT("movaps %%xmm0, %%xmm4") /* xmm4 = ra[i] */ 76 __ASM_EMIT("movaps %%xmm1, %%xmm5") /* xmm5 = ia[i] */ 77 __ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = ra[i] - rc[i] */ 78 __ASM_EMIT("subps %%xmm3, %%xmm1") /* xmm1 = ia[i] - ic[i] */ 79 __ASM_EMIT("addps %%xmm4, %%xmm2") /* xmm2 = ra[i] + rc[i] */ 80 __ASM_EMIT("addps %%xmm5, %%xmm3") /* xmm3 = ia[i] + ic[i] */ 81 82 __ASM_EMIT("movups %%xmm2, 0x00(%[a])") 83 __ASM_EMIT("movups %%xmm3, 0x10(%[a])") 84 __ASM_EMIT("movups %%xmm0, 0x00(%[b])") 85 __ASM_EMIT("movups %%xmm1, 0x10(%[b])") 86 87 __ASM_EMIT("add $0x20, %[a]") 88 __ASM_EMIT("add $0x20, %[b]") 89 __ASM_EMIT("sub $8, %[k]") 90 __ASM_EMIT("jz 2f") 91 92 /* Rotate angle */ 93 __ASM_EMIT("movaps 0x00(%[wk]), %%xmm0") /* xmm0 = rw */ 94 __ASM_EMIT("movaps 0x10(%[wk]), %%xmm1") /* xmm1 = iw */ 95 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = rw */ 96 __ASM_EMIT("movaps %%xmm1, %%xmm3") /* xmm3 = iw */ 97 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = ra * iw */ 98 __ASM_EMIT("mulps %%xmm7, %%xmm1") /* xmm1 = ia * iw */ 99 __ASM_EMIT("mulps %%xmm0, %%xmm6") /* xmm6 = ra * rw */ 100 __ASM_EMIT("mulps %%xmm2, %%xmm7") /* xmm7 = ia * rw */ 101 __ASM_EMIT("subps %%xmm1, %%xmm6") /* xmm6 = ra * rw - ia * iw */ 102 __ASM_EMIT("addps %%xmm3, %%xmm7") /* xmm7 = ra * iw + ia * rw */ 103 __ASM_EMIT("jmp 1b") 104 105 __ASM_EMIT("2:") 106 107 : [a] "+r"(a), [b] "+r"(b), [k] "+r" (k) 108 : [wk] "r"(wk) 109 : "cc", "memory", 110 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 111 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 112 ); 113 } 114 115 wk += 8; 116 ak += 8; 117 n <<= 1; 118 bs <<= 1; 119 } 120 121 if (n < items) 122 { 123 // ONE LARGE CYCLE 124 // Set initial values of pointers 125 float kn = 1.0f / last; 126 size_t k = n; 127 128 ARCH_X86_ASM 129 ( 130 __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6") /* xmm6 = rak[i] */ 131 __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7") /* xmm7 = iak[i] */ 132 __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") /* xmm0 = kn */ 133 __ASM_EMIT("movaps %%xmm0, %%xmm1") /* xmm1 = kn */ 134 : "+Yz" (kn) 135 : [ak] "r" (ak) 136 : "%xmm1", "%xmm6", "%xmm7" 137 ); 138 139 ARCH_X86_ASM 140 ( 141 // __ASM_EMIT(".align 16") 142 __ASM_EMIT("1:") 143 144 __ASM_EMIT("movups 0x00(%[tmp]), %%xmm4") /* xmm4 = ra[i] */ 145 __ASM_EMIT("movups 0x00(%[tmp],%[n],4), %%xmm2") /* xmm2 = rb[i] */ 146 __ASM_EMIT("movups 0x10(%[tmp],%[n],4), %%xmm3") /* xmm3 = ib[i] */ 147 148 __ASM_EMIT("mulps %%xmm6, %%xmm2") /* xmm2 = rak[i]*rb[i] */ 149 __ASM_EMIT("mulps %%xmm7, %%xmm3") /* xmm3 = iak[i]*ib[i] */ 150 __ASM_EMIT("movaps %%xmm4, %%xmm5") /* xmm5 = ra[i] */ 151 __ASM_EMIT("subps %%xmm3, %%xmm2") /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */ 152 __ASM_EMIT("addps %%xmm2, %%xmm4") /* xmm4 = ra[i] + rc[i] */ 153 __ASM_EMIT("subps %%xmm2, %%xmm5") /* xmm5 = ra[i] - rc[i] */ 154 __ASM_EMIT("mulps %%xmm0, %%xmm4") /* xmm4 = kn*(ra[i] + rc[i]) */ 155 __ASM_EMIT("mulps %%xmm1, %%xmm5") /* xmm5 = kn*(ra[i] - rc[i]) */ 156 157 __ASM_EMIT("movups 0x00(%[dst]), %%xmm2") 158 __ASM_EMIT("movups 0x00(%[dst], %[n], 2), %%xmm3") 159 __ASM_EMIT("addps %%xmm4, %%xmm2") 160 __ASM_EMIT("addps %%xmm5, %%xmm3") 161 __ASM_EMIT("movups %%xmm2, 0x00(%[dst])") 162 __ASM_EMIT("movups %%xmm3, 0x00(%[dst],%[n],2)") 163 164 __ASM_EMIT("add $0x20, %[tmp]") 165 __ASM_EMIT("add $0x10, %[dst]") 166 __ASM_EMIT("sub $8, %[k]") 167 __ASM_EMIT("jz 2f") 168 169 /* Rotate angle */ 170 __ASM_EMIT("movaps 0x00(%[wk]), %%xmm4") /* xmm4 = rw */ 171 __ASM_EMIT("movaps 0x10(%[wk]), %%xmm5") /* xmm5 = iw */ 172 __ASM_EMIT("movaps %%xmm4, %%xmm2") /* xmm2 = rw */ 173 __ASM_EMIT("movaps %%xmm5, %%xmm3") /* xmm3 = iw */ 174 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = ra * iw */ 175 __ASM_EMIT("mulps %%xmm7, %%xmm5") /* xmm5 = ia * iw */ 176 __ASM_EMIT("mulps %%xmm4, %%xmm6") /* xmm6 = ra * rw */ 177 __ASM_EMIT("mulps %%xmm2, %%xmm7") /* xmm7 = ia * rw */ 178 __ASM_EMIT("subps %%xmm5, %%xmm6") /* xmm6 = ra * rw - ia * iw */ 179 __ASM_EMIT("addps %%xmm3, %%xmm7") /* xmm7 = ra * iw + ia * rw */ 180 __ASM_EMIT("jmp 1b") 181 182 __ASM_EMIT("2:") 183 184 : [tmp] "+r"(tmp), [dst] "+r"(dst), [n] "+r"(n), [k] "+r" (k) 185 : [wk] "r"(wk) 186 : "cc", "memory", 187 "%xmm0", "%xmm5", "%xmm2", "%xmm3", 188 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 189 ); 190 } 191 else 192 { 193 // Add real result to the target (ignore complex result) 194 float kn = 1.0f / last; 195 196 // Unpack 4x split complex to 4x real 197 ARCH_X86_ASM 198 ( 199 __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") /* xmm0 = kn */ 200 __ASM_EMIT("movups 0x00(%[tmp]), %%xmm1") /* xmm0 = s[i] */ 201 __ASM_EMIT("movups 0x00(%[dst]), %%xmm2") 202 __ASM_EMIT("mulps %%xmm0, %%xmm1") 203 __ASM_EMIT("addps %%xmm1, %%xmm2") 204 __ASM_EMIT("movups %%xmm2, 0x00(%[dst])") 205 : "+Yz" (kn) 206 : [tmp] "r" (tmp), [dst] "r" (dst) 207 : "cc", "memory", 208 "%xmm1", "%xmm2" 209 ); 210 } 211 } 212 } 213