/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 6 Mar 2017
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_SSE_FASTCONV_H_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_SSE_FASTCONV_H_IMPL */

namespace sse
{
    void fastconv_restore(float *dst, float *tmp, size_t rank)
    {
        // Prepare for butterflies
        size_t last     = size_t(1) << rank;
        size_t items    = last;
        float *ptr      = tmp;

        ARCH_X86_ASM
        (
            // __ASM_EMIT(".align 16")
            __ASM_EMIT("1:")

            // Load data
            __ASM_EMIT("movups 0x00(%[ptr]), %%xmm1")
            __ASM_EMIT("movups 0x10(%[ptr]), %%xmm3")
            __ASM_EMIT("movups 0x20(%[ptr]), %%xmm5")
            __ASM_EMIT("movups 0x30(%[ptr]), %%xmm7")

            // Do x4 reverse butterflies
            // xmm1 = r0 r1 r2 r3
            // xmm3 = i0 i1 i2 i3
            // xmm5 = r4 r5 r6 r7
            // xmm7 = i4 i5 i6 i7
            __ASM_EMIT("movaps %%xmm1, %%xmm0")         /* xmm0 = r0 r1 r2 r3 */
            __ASM_EMIT("movaps %%xmm5, %%xmm4")
            __ASM_EMIT("shufps $0x88, %%xmm3, %%xmm0")  /* xmm0 = r0 r2 i0 i2 */
            __ASM_EMIT("shufps $0x88, %%xmm7, %%xmm4")
            __ASM_EMIT("shufps $0xdd, %%xmm3, %%xmm1")  /* xmm1 = r1 r3 i1 i3 */
            __ASM_EMIT("shufps $0xdd, %%xmm7, %%xmm5")
            __ASM_EMIT("movaps %%xmm0, %%xmm2")         /* xmm2 = r0 r2 i0 i2 */
            __ASM_EMIT("movaps %%xmm4, %%xmm6")
            __ASM_EMIT("addps %%xmm1, %%xmm0")          /* xmm0 = r0+r1 r2+r3 i0+i1 i2+i3 = r0k r2k i0k i2k */
            __ASM_EMIT("addps %%xmm5, %%xmm4")
            __ASM_EMIT("subps %%xmm1, %%xmm2")          /* xmm2 = r0-r1 r2-r3 i0-i1 i2-i3 = r1k r3k i1k i3k */
            __ASM_EMIT("subps %%xmm5, %%xmm6")

            __ASM_EMIT("movaps %%xmm0, %%xmm1")         /* xmm1 = r0k r2k i0k i2k */
            __ASM_EMIT("movaps %%xmm4, %%xmm5")
            __ASM_EMIT("shufps $0x88, %%xmm2, %%xmm0")  /* xmm0 = r0k i0k r1k i1k */
            __ASM_EMIT("shufps $0x88, %%xmm6, %%xmm4")
            __ASM_EMIT("shufps $0x7d, %%xmm2, %%xmm1")  /* xmm1 = r2k i2k i3k r3k */
            __ASM_EMIT("shufps $0x7d, %%xmm6, %%xmm5")
            __ASM_EMIT("movaps %%xmm0, %%xmm2")         /* xmm2 = r0k i0k r1k i1k */
            __ASM_EMIT("movaps %%xmm4, %%xmm6")
            __ASM_EMIT("addps %%xmm1, %%xmm0")          /* xmm0 = r0k+r2k i0k+i2k r1k+i3k i1k+r3k = d0 d4 d3 d5 */
            __ASM_EMIT("addps %%xmm5, %%xmm4")
            __ASM_EMIT("subps %%xmm1, %%xmm2")          /* xmm2 = r0k-r2k i0k-i2k r1k-i3k i1k-r3k = d2 d6 d1 d7 */
            __ASM_EMIT("subps %%xmm5, %%xmm6")

            __ASM_EMIT("movaps %%xmm0, %%xmm1")         /* xmm1 = d0 d4 d3 d5 */
            __ASM_EMIT("movaps %%xmm4, %%xmm5")
            __ASM_EMIT("shufps $0x88, %%xmm2, %%xmm0")  /* xmm0 = d0 d3 d2 d1 */
            __ASM_EMIT("shufps $0x88, %%xmm6, %%xmm4")
            __ASM_EMIT("shufps $0xdd, %%xmm2, %%xmm1")  /* xmm1 = d4 d5 d6 d7 */
            __ASM_EMIT("shufps $0xdd, %%xmm6, %%xmm5")
            __ASM_EMIT("shufps $0x6c, %%xmm0, %%xmm0")  /* xmm0 = d0 d1 d2 d3 */
            __ASM_EMIT("shufps $0x6c, %%xmm4, %%xmm4")

            // Finally store result
            __ASM_EMIT("movups %%xmm0, 0x00(%[ptr])")
            __ASM_EMIT("movups %%xmm1, 0x10(%[ptr])")
            __ASM_EMIT("movups %%xmm4, 0x20(%[ptr])")
            __ASM_EMIT("movups %%xmm5, 0x30(%[ptr])")

            __ASM_EMIT("add $0x40, %[ptr]")
            __ASM_EMIT("sub $8, %[k]")
            __ASM_EMIT("jnz 1b")

            : [ptr] "+r" (ptr), [k] "+r" (items)
            :
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
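        // In scalar form, the loop above applies the first two levels of the
        // reverse FFT to each group of four complex values. This reference
        // sketch is not compiled; it assumes the split layout implied by the
        // loads above (blocks of four reals followed by four imaginaries) and
        // rank >= 3, so that last is a multiple of 8:
        //
        //   for (size_t i = 0; i < last*2; i += 8)
        //   {
        //       float *re = &tmp[i], *im = &tmp[i+4];
        //       float kr0 = re[0] + re[1], kr1 = re[0] - re[1];  // radix-2 on pairs
        //       float kr2 = re[2] + re[3], kr3 = re[2] - re[3];
        //       float ki0 = im[0] + im[1], ki1 = im[0] - im[1];
        //       float ki2 = im[2] + im[3], ki3 = im[2] - im[3];
        //       re[0] = kr0 + kr2;  im[0] = ki0 + ki2;           // out0 = k0 + k2
        //       re[1] = kr1 - ki3;  im[1] = ki1 + kr3;           // out1 = k1 + j*k3
        //       re[2] = kr0 - kr2;  im[2] = ki0 - ki2;           // out2 = k0 - k2
        //       re[3] = kr1 + ki3;  im[3] = ki1 - kr3;           // out3 = k1 - j*k3
        //   }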
__ASM_EMIT("shufps $0x6c, %%xmm4, %%xmm4") 84 85 // Finally store result 86 __ASM_EMIT("movups %%xmm0, 0x00(%[ptr])") 87 __ASM_EMIT("movups %%xmm1, 0x10(%[ptr])") 88 __ASM_EMIT("movups %%xmm4, 0x20(%[ptr])") 89 __ASM_EMIT("movups %%xmm5, 0x30(%[ptr])") 90 91 __ASM_EMIT("add $0x40, %[ptr]") 92 __ASM_EMIT("sub $8, %[k]") 93 __ASM_EMIT("jnz 1b") 94 95 : [ptr] "+r" (ptr), [k] "+r" (items) 96 : 97 : "cc", "memory", 98 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 99 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 100 ); 101 102 items = last << 1; 103 size_t n = 8; 104 size_t bs = n << 1; 105 106 const float *wk = XFFT_W; 107 const float *ak = XFFT_A; 108 109 // Iterate butterflies 110 while (n < last) 111 { 112 for (size_t p=0; p<items; p += bs) 113 { 114 // Set initial values of pointers 115 float *a = &tmp[p]; 116 float *b = &a[n]; 117 size_t k = n; 118 119 ARCH_X86_ASM 120 ( 121 __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6") /* xmm6 = rak[i] */ 122 __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7") /* xmm7 = iak[i] */ 123 : 124 : [ak] "r"(ak) 125 : "%xmm6", "%xmm7" 126 ); 127 128 ARCH_X86_ASM 129 ( 130 // __ASM_EMIT(".align 16") 131 __ASM_EMIT("1:") 132 133 __ASM_EMIT("movups 0x00(%[a]), %%xmm0") /* xmm0 = ra[i] */ 134 __ASM_EMIT("movups 0x10(%[a]), %%xmm1") /* xmm1 = ia[i] */ 135 __ASM_EMIT("movups 0x00(%[b]), %%xmm2") /* xmm2 = rb[i] */ 136 __ASM_EMIT("movups 0x10(%[b]), %%xmm3") /* xmm3 = ib[i] */ 137 138 __ASM_EMIT("movaps %%xmm2, %%xmm4") /* xmm4 = rb[i] */ 139 __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = ib[i] */ 140 __ASM_EMIT("mulps %%xmm6, %%xmm2") /* xmm2 = rak[i]*rb[i] */ 141 __ASM_EMIT("mulps %%xmm7, %%xmm4") /* xmm4 = iak[i]*rb[i] */ 142 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = rak[i]*ib[i] */ 143 __ASM_EMIT("mulps %%xmm7, %%xmm5") /* xmm5 = iak[i]*ib[i] */ 144 __ASM_EMIT("addps %%xmm4, %%xmm3") /* xmm3 = rak[i]*ib[i] + iak[i]*rb[i] = ic[i] */ 145 __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */ 146 __ASM_EMIT("movaps %%xmm0, %%xmm4") /* xmm4 = ra[i] */ 147 __ASM_EMIT("movaps %%xmm1, %%xmm5") /* xmm5 = ia[i] */ 148 __ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = ra[i] - rc[i] */ 149 __ASM_EMIT("subps %%xmm3, %%xmm1") /* xmm1 = ia[i] - ic[i] */ 150 __ASM_EMIT("addps %%xmm4, %%xmm2") /* xmm2 = ra[i] + rc[i] */ 151 __ASM_EMIT("addps %%xmm5, %%xmm3") /* xmm3 = ia[i] + ic[i] */ 152 153 __ASM_EMIT("movups %%xmm2, 0x00(%[a])") 154 __ASM_EMIT("movups %%xmm3, 0x10(%[a])") 155 __ASM_EMIT("movups %%xmm0, 0x00(%[b])") 156 __ASM_EMIT("movups %%xmm1, 0x10(%[b])") 157 158 __ASM_EMIT("add $0x20, %[a]") 159 __ASM_EMIT("add $0x20, %[b]") 160 __ASM_EMIT("sub $8, %[k]") 161 __ASM_EMIT("jz 2f") 162 163 /* Rotate angle */ 164 __ASM_EMIT("movaps 0x00(%[wk]), %%xmm0") /* xmm0 = rw */ 165 __ASM_EMIT("movaps 0x10(%[wk]), %%xmm1") /* xmm1 = iw */ 166 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = rw */ 167 __ASM_EMIT("movaps %%xmm1, %%xmm3") /* xmm3 = iw */ 168 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = ra * iw */ 169 __ASM_EMIT("mulps %%xmm7, %%xmm1") /* xmm1 = ia * iw */ 170 __ASM_EMIT("mulps %%xmm0, %%xmm6") /* xmm6 = ra * rw */ 171 __ASM_EMIT("mulps %%xmm2, %%xmm7") /* xmm7 = ia * rw */ 172 __ASM_EMIT("subps %%xmm1, %%xmm6") /* xmm6 = ra * rw - ia * iw */ 173 __ASM_EMIT("addps %%xmm3, %%xmm7") /* xmm7 = ra * iw + ia * rw */ 174 __ASM_EMIT("jmp 1b") 175 176 __ASM_EMIT("2:") 177 178 : [a] "+r"(a), [b] "+r"(b), [k] "+r" (k) 179 : [wk] "r"(wk) 180 : "cc", "memory", 181 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 182 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 183 ); 184 } 185 186 wk += 8; 187 ak += 8; 188 n <<= 1; 189 bs 
<<= 1; 190 } 191 192 if (n < items) 193 { 194 // ONE LARGE CYCLE 195 // Set initial values of pointers 196 float kn = 1.0f / last; 197 size_t k = n; 198 199 ARCH_X86_ASM 200 ( 201 __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6") /* xmm6 = rak[i] */ 202 __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7") /* xmm7 = iak[i] */ 203 __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") /* xmm0 = kn */ 204 __ASM_EMIT("movaps %%xmm0, %%xmm1") /* xmm1 = kn */ 205 : "+Yz" (kn) 206 : [ak] "r" (ak) 207 : "%xmm1", "%xmm6", "%xmm7" 208 ); 209 210 ARCH_X86_ASM 211 ( 212 // __ASM_EMIT(".align 16") 213 __ASM_EMIT("1:") 214 215 __ASM_EMIT("movups 0x00(%[tmp]), %%xmm4") /* xmm4 = ra[i] */ 216 __ASM_EMIT("movups 0x00(%[tmp],%[n],4), %%xmm2") /* xmm2 = rb[i] */ 217 __ASM_EMIT("movups 0x10(%[tmp],%[n],4), %%xmm3") /* xmm3 = ib[i] */ 218 219 __ASM_EMIT("mulps %%xmm6, %%xmm2") /* xmm2 = rak[i]*rb[i] */ 220 __ASM_EMIT("mulps %%xmm7, %%xmm3") /* xmm3 = iak[i]*ib[i] */ 221 __ASM_EMIT("movaps %%xmm4, %%xmm5") /* xmm5 = ra[i] */ 222 __ASM_EMIT("subps %%xmm3, %%xmm2") /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */ 223 __ASM_EMIT("addps %%xmm2, %%xmm4") /* xmm4 = ra[i] + rc[i] */ 224 __ASM_EMIT("subps %%xmm2, %%xmm5") /* xmm5 = ra[i] - rc[i] */ 225 __ASM_EMIT("mulps %%xmm0, %%xmm4") /* xmm4 = kn*(ra[i] + rc[i]) */ 226 __ASM_EMIT("mulps %%xmm1, %%xmm5") /* xmm5 = kn*(ra[i] - rc[i]) */ 227 228 __ASM_EMIT("movups %%xmm4, 0x00(%[dst])") 229 __ASM_EMIT("movups %%xmm5, 0x00(%[dst],%[n],2)") 230 231 __ASM_EMIT("add $0x20, %[tmp]") 232 __ASM_EMIT("add $0x10, %[dst]") 233 __ASM_EMIT("sub $8, %[k]") 234 __ASM_EMIT("jz 2f") 235 236 /* Rotate angle */ 237 __ASM_EMIT("movaps 0x00(%[wk]), %%xmm4") /* xmm4 = rw */ 238 __ASM_EMIT("movaps 0x10(%[wk]), %%xmm5") /* xmm5 = iw */ 239 __ASM_EMIT("movaps %%xmm4, %%xmm2") /* xmm2 = rw */ 240 __ASM_EMIT("movaps %%xmm5, %%xmm3") /* xmm3 = iw */ 241 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = ra * iw */ 242 __ASM_EMIT("mulps %%xmm7, %%xmm5") /* xmm5 = ia * iw */ 243 __ASM_EMIT("mulps %%xmm4, %%xmm6") /* xmm6 = ra * rw */ 244 __ASM_EMIT("mulps %%xmm2, %%xmm7") /* xmm7 = ia * rw */ 245 __ASM_EMIT("subps %%xmm5, %%xmm6") /* xmm6 = ra * rw - ia * iw */ 246 __ASM_EMIT("addps %%xmm3, %%xmm7") /* xmm7 = ra * iw + ia * rw */ 247 __ASM_EMIT("jmp 1b") 248 249 __ASM_EMIT("2:") 250 251 : [tmp] "+r"(tmp), [dst] "+r"(dst), [n] "+r"(n), [k] "+r" (k) 252 : [wk] "r"(wk) 253 : "cc", "memory", 254 "%xmm0", "%xmm5", "%xmm2", "%xmm3", 255 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 256 ); 257 } 258 else 259 { 260 // Add real result to the target (ignore complex result) 261 float kn = 1.0f / last; 262 263 // Unpack 4x split complex to 4x real 264 ARCH_X86_ASM 265 ( 266 __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") /* xmm0 = kn */ 267 __ASM_EMIT("movups 0x00(%[tmp]), %%xmm1") /* xmm0 = s[i] */ 268 __ASM_EMIT("movups 0x00(%[dst]), %%xmm2") 269 __ASM_EMIT("mulps %%xmm0, %%xmm1") 270 __ASM_EMIT("addps %%xmm1, %%xmm2") 271 __ASM_EMIT("movups %%xmm2, 0x00(%[dst])") 272 : "+Yz" (kn) 273 : [tmp] "r" (tmp), [dst] "r" (dst) 274 : "cc", "memory", 275 "%xmm1", "%xmm2" 276 ); 277 } 278 279 } // RESTORE_IMPL 280 } 281