/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 5 Mar 2017
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_SSE_FASTCONV_H_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_SSE_FASTCONV_H_IMPL */

namespace sse
{
    void fastconv_parse(float *dst, const float *src, size_t rank)
    {
        // Prepare for butterflies
        const float *wk = &XFFT_W[(rank-3) << 3];
        const float *ak = &XFFT_A[(rank-3) << 3];
        size_t items    = size_t(1) << (rank + 1);
        size_t bs       = items;
        size_t n        = bs >> 1;

        // Iterate first cycle
        if (n > 4)
        {
            // ONE LARGE CYCLE
            // Set initial values of pointers
            float *a    = dst;
            float *b    = &a[n];
            size_t k    = n;

            ARCH_X86_ASM
            (
                __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6")        /* xmm6 = rA[i] */
                __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7")        /* xmm7 = iA[i] */
                __ASM_EMIT("xorps %%xmm5, %%xmm5")              /* xmm5 = 0 */
                :
                : [ak] "r" (ak)
                :
                "%xmm5", "%xmm6", "%xmm7"
            );

            ARCH_X86_ASM
            (
                // __ASM_EMIT(".align 16")
                __ASM_EMIT("1:")

                __ASM_EMIT("movups 0x00(%[src]), %%xmm0")       /* xmm0 = s[i] */
                __ASM_EMIT("xorps %%xmm3, %%xmm3")              /* xmm3 = 0 */
                __ASM_EMIT("movaps %%xmm0, %%xmm4")             /* xmm4 = s[i] */
                __ASM_EMIT("movaps %%xmm0, %%xmm2")             /* xmm2 = s[i] */
                __ASM_EMIT("mulps %%xmm7, %%xmm4")              /* xmm4 = s[i]*iA[i] */
                __ASM_EMIT("mulps %%xmm6, %%xmm2")              /* xmm2 = s[i]*rA[i] */
                __ASM_EMIT("subps %%xmm4, %%xmm3")              /* xmm3 = -s[i]*iA[i] */

                __ASM_EMIT("movups %%xmm0, 0x00(%[a])")
                __ASM_EMIT("movups %%xmm5, 0x10(%[a])")
                __ASM_EMIT("movups %%xmm2, 0x00(%[b])")
                __ASM_EMIT("movups %%xmm3, 0x10(%[b])")

                __ASM_EMIT("add $0x10, %[src]")
                __ASM_EMIT("add $0x20, %[a]")
                __ASM_EMIT("add $0x20, %[b]")

                /* Repeat loop */
                __ASM_EMIT("sub $8, %[k]")
                __ASM_EMIT("jz 2f")

                /* Rotate angle */
                __ASM_EMIT("movaps 0x00(%[wk]), %%xmm0")        /* xmm0 = rw[i] */
                __ASM_EMIT("movaps 0x10(%[wk]), %%xmm1")        /* xmm1 = iw[i] */
                __ASM_EMIT("movaps %%xmm0, %%xmm2")             /* xmm2 = rw[i] */
                __ASM_EMIT("movaps %%xmm1, %%xmm3")             /* xmm3 = iw[i] */
                __ASM_EMIT("mulps %%xmm6, %%xmm3")              /* xmm3 = rA[i] * iw[i] */
                __ASM_EMIT("mulps %%xmm7, %%xmm1")              /* xmm1 = iA[i] * iw[i] */
                __ASM_EMIT("mulps %%xmm0, %%xmm6")              /* xmm6 = rA[i] * rw[i] */
                __ASM_EMIT("mulps %%xmm2, %%xmm7")              /* xmm7 = iA[i] * rw[i] */
                __ASM_EMIT("subps %%xmm1, %%xmm6")              /* xmm6 = rA[i] * rw[i] - iA[i] * iw[i] */
                __ASM_EMIT("addps %%xmm3, %%xmm7")              /* xmm7 = rA[i] * iw[i] + iA[i] * rw[i] */
                __ASM_EMIT("jmp 1b")

                __ASM_EMIT("2:")

                : [a] "+r" (a), [b] "+r" (b), [src] "+r" (src), [k] "+r" (k)
                : [wk] "r" (wk)
                : "cc", "memory",
                "%xmm0", "%xmm1", "%xmm2", "%xmm3",
101 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 102 ); 103 104 wk -= 8; 105 ak -= 8; 106 n >>= 1; 107 bs >>= 1; 108 } 109 else 110 { 111 // Unpack 4x real to 4x split complex 112 ARCH_X86_ASM 113 ( 114 __ASM_EMIT("movups 0x00(%[src]), %%xmm0") /* xmm0 = s[i] */ 115 __ASM_EMIT("xorps %%xmm1, %%xmm1") /* xmm1 = 0 */ 116 __ASM_EMIT("movups %%xmm0, 0x00(%[dst])") 117 __ASM_EMIT("movups %%xmm1, 0x10(%[dst])") 118 : 119 : [items] "r" (items), [src] "r" (src), [dst] "r" (dst) 120 : "cc", "memory", 121 "%xmm0", "%xmm1" 122 ); 123 } 124 125 // Iterate butterflies 126 for (; n > 4; n >>= 1, bs >>= 1) 127 { 128 for (size_t p=0; p<items; p += bs) 129 { 130 // Set initial values of pointers 131 float *a = &dst[p]; 132 float *b = &a[n]; 133 size_t k = n; 134 135 ARCH_X86_ASM 136 ( 137 __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6") /* xmm6 = rA[i] */ 138 __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7") /* xmm7 = iA[i] */ 139 : 140 : [ak] "r"(ak) 141 : "%xmm6", "%xmm7" 142 ); 143 144 ARCH_X86_ASM 145 ( 146 // __ASM_EMIT(".align 16") 147 __ASM_EMIT("1:") 148 149 __ASM_EMIT("movups 0x00(%[a]), %%xmm0") /* xmm0 = ra[i] */ 150 __ASM_EMIT("movups 0x10(%[a]), %%xmm1") /* xmm1 = ia[i] */ 151 __ASM_EMIT("movups 0x00(%[b]), %%xmm2") /* xmm2 = rb[i] */ 152 __ASM_EMIT("movups 0x10(%[b]), %%xmm3") /* xmm3 = ib[i] */ 153 154 __ASM_EMIT("movaps %%xmm0, %%xmm4") /* xmm4 = ra[i] */ 155 __ASM_EMIT("movaps %%xmm1, %%xmm5") /* xmm5 = ia[i] */ 156 __ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = ra[i]+rb[i] */ 157 __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = ia[i]+ib[i] */ 158 __ASM_EMIT("subps %%xmm2, %%xmm4") /* xmm4 = ra[i]-rb[i] = rc[i] */ 159 __ASM_EMIT("subps %%xmm3, %%xmm5") /* xmm5 = ia[i]-ib[i] = ic[i] */ 160 __ASM_EMIT("movaps %%xmm4, %%xmm2") /* xmm2 = rc[i] */ 161 __ASM_EMIT("movaps %%xmm5, %%xmm3") /* xmm3 = ic[i] */ 162 __ASM_EMIT("mulps %%xmm6, %%xmm4") /* xmm4 = rA[i]*rc[i] */ 163 __ASM_EMIT("mulps %%xmm7, %%xmm5") /* xmm5 = iA[i]*ic[i] */ 164 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = rA[i]*ic[i] */ 165 __ASM_EMIT("mulps %%xmm7, %%xmm2") /* xmm2 = iA[i]*rc[i] */ 166 __ASM_EMIT("addps %%xmm5, %%xmm4") /* xmm4 = rA[i]*rc[i] + iA[i]*ic[i] */ 167 __ASM_EMIT("subps %%xmm2, %%xmm3") /* xmm3 = rA[i]*ic[i] - iA[i]*rc[i] */ 168 169 __ASM_EMIT("movups %%xmm0, 0x00(%[a])") 170 __ASM_EMIT("movups %%xmm1, 0x10(%[a])") 171 __ASM_EMIT("movups %%xmm4, 0x00(%[b])") 172 __ASM_EMIT("movups %%xmm3, 0x10(%[b])") 173 174 /* Repeat loop */ 175 __ASM_EMIT("add $0x20, %[a]") 176 __ASM_EMIT("add $0x20, %[b]") 177 __ASM_EMIT("sub $8, %[k]") 178 __ASM_EMIT("jz 2f") 179 180 /* Rotate angle */ 181 __ASM_EMIT("movaps 0x00(%[wk]), %%xmm0") /* xmm0 = rw */ 182 __ASM_EMIT("movaps 0x10(%[wk]), %%xmm1") /* xmm1 = iw */ 183 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = rw */ 184 __ASM_EMIT("movaps %%xmm1, %%xmm3") /* xmm3 = iw */ 185 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = ra * iw */ 186 __ASM_EMIT("mulps %%xmm7, %%xmm1") /* xmm1 = ia * iw */ 187 __ASM_EMIT("mulps %%xmm0, %%xmm6") /* xmm6 = ra * rw */ 188 __ASM_EMIT("mulps %%xmm2, %%xmm7") /* xmm7 = ia * rw */ 189 __ASM_EMIT("subps %%xmm1, %%xmm6") /* xmm6 = ra * rw - ia * iw */ 190 __ASM_EMIT("addps %%xmm3, %%xmm7") /* xmm7 = ra * iw + ia * rw */ 191 __ASM_EMIT("jmp 1b") 192 193 __ASM_EMIT("2:") 194 195 : [a] "+r" (a), [b] "+r" (b), [k] "+r" (k) 196 : [wk] "r"(wk) 197 : "cc", "memory", 198 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 199 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 200 ); 201 } 202 203 ak -= 8; 204 wk -= 8; 205 } 206 207 ARCH_X86_ASM 208 ( 209 // __ASM_EMIT(".align 16") 210 __ASM_EMIT("1:") 211 212 
__ASM_EMIT("movups 0x00(%[dst]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ 213 __ASM_EMIT("movups 0x10(%[dst]), %%xmm1") /* xmm1 = i0 i1 i2 i3 */ 214 __ASM_EMIT("movups 0x20(%[dst]), %%xmm4") 215 __ASM_EMIT("movups 0x30(%[dst]), %%xmm5") 216 217 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = r0 r1 r2 r3 */ 218 __ASM_EMIT("movaps %%xmm4, %%xmm6") 219 __ASM_EMIT("unpcklps %%xmm1, %%xmm0") /* xmm0 = r0 i0 r1 i1 */ 220 __ASM_EMIT("unpcklps %%xmm5, %%xmm4") 221 __ASM_EMIT("unpckhps %%xmm1, %%xmm2") /* xmm2 = r2 i2 r3 i3 */ 222 __ASM_EMIT("unpckhps %%xmm5, %%xmm6") 223 __ASM_EMIT("movaps %%xmm0, %%xmm1") /* xmm1 = r0 i0 r1 i1 */ 224 __ASM_EMIT("movaps %%xmm4, %%xmm5") 225 226 __ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = r0+r2 i0+i2 r1+r3 i1+i3 = r0k i0k r2k i2k */ 227 __ASM_EMIT("addps %%xmm6, %%xmm4") 228 __ASM_EMIT("subps %%xmm2, %%xmm1") /* xmm1 = r0-r2 i0-i2 r1-r3 i1-i3 = r1k i1k r3k i3k */ 229 __ASM_EMIT("subps %%xmm6, %%xmm5") 230 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = r0k i0k r2k i2k */ 231 __ASM_EMIT("movaps %%xmm4, %%xmm6") 232 __ASM_EMIT("unpcklps %%xmm1, %%xmm0") /* xmm0 = r0k r1k i0k i1k */ 233 __ASM_EMIT("unpcklps %%xmm5, %%xmm4") 234 __ASM_EMIT("unpckhps %%xmm1, %%xmm2") /* xmm2 = r2k r3k i2k i3k */ 235 __ASM_EMIT("unpckhps %%xmm5, %%xmm6") 236 __ASM_EMIT("movaps %%xmm0, %%xmm1") /* xmm1 = r0k r1k i0k i1k */ 237 __ASM_EMIT("movaps %%xmm4, %%xmm5") 238 __ASM_EMIT("shufps $0x6c, %%xmm2, %%xmm2") /* xmm2 = r2k i3k i2k r3k */ 239 __ASM_EMIT("shufps $0x6c, %%xmm6, %%xmm6") 240 241 __ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = r0k+r2k r1k+i3k i0k+i2k i1k+r3k */ 242 __ASM_EMIT("addps %%xmm6, %%xmm4") 243 __ASM_EMIT("subps %%xmm2, %%xmm1") /* xmm1 = r0k-r2k r1k-i3k i0k-i2k i1k-r3k */ 244 __ASM_EMIT("subps %%xmm6, %%xmm5") 245 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = r0k+r2k r1k+i3k i0k+i2k i1k+r3k */ 246 __ASM_EMIT("movaps %%xmm4, %%xmm6") 247 __ASM_EMIT("unpcklps %%xmm1, %%xmm0") /* xmm0 = r0k+r2k r0k-r2k r1k+i3k r1k-i3k */ 248 __ASM_EMIT("unpcklps %%xmm5, %%xmm4") 249 __ASM_EMIT("unpckhps %%xmm1, %%xmm2") /* xmm2 = i0k+i2k i0k-i2k i1k+r3k i1k-r3k */ 250 __ASM_EMIT("unpckhps %%xmm5, %%xmm6") 251 __ASM_EMIT("shufps $0xb4, %%xmm2, %%xmm2") /* xmm2 = i0k+i2k i0k-i2k i1k-r3k i1k+r3k */ 252 __ASM_EMIT("shufps $0xb4, %%xmm6, %%xmm6") 253 254 __ASM_EMIT("movups %%xmm0, 0x00(%[dst])") 255 __ASM_EMIT("movups %%xmm2, 0x10(%[dst])") 256 __ASM_EMIT("movups %%xmm4, 0x20(%[dst])") 257 __ASM_EMIT("movups %%xmm6, 0x30(%[dst])") 258 259 __ASM_EMIT("add $0x40, %[dst]") 260 __ASM_EMIT("sub $16, %[k]") 261 __ASM_EMIT("jnz 1b") 262 263 : [dst] "+r" (dst), [k] "+r" (items) 264 : 265 : "cc", "memory", 266 "%xmm0", "%xmm1", "%xmm2", 267 "%xmm4", "%xmm5", "%xmm6" 268 ); 269 } 270 fastconv_parse_internal(float * dst,const float * src,size_t rank)271 static inline void fastconv_parse_internal(float *dst, const float *src, size_t rank) 272 { 273 // Prepare for butterflies 274 const float *wk = &XFFT_W[(rank-3) << 3]; 275 const float *ak = &XFFT_A[(rank-3) << 3]; 276 size_t items = size_t(1) << (rank + 1); 277 size_t bs = items; 278 size_t n = bs >> 1; 279 280 // Iterate first cycle 281 if (n > 4) 282 { 283 // ONE LARGE CYCLE 284 // Set initial values of pointers 285 float *a = dst; 286 float *b = &a[n]; 287 size_t k = n; 288 289 ARCH_X86_ASM 290 ( 291 __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6") /* xmm6 = rA[i] */ 292 __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7") /* xmm7 = iA[i] */ 293 __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 */ 294 : 295 : [ak] "r"(ak) 296 : 297 "%xmm5", "%xmm6", "%xmm7" 298 ); 299 300 ARCH_X86_ASM 
            (
                // __ASM_EMIT(".align 16")
                __ASM_EMIT("1:")

                __ASM_EMIT("movups 0x00(%[src]), %%xmm0")       /* xmm0 = s[i] */

                __ASM_EMIT("xorps %%xmm3, %%xmm3")              /* xmm3 = 0 */
                __ASM_EMIT("movaps %%xmm0, %%xmm4")             /* xmm4 = s[i] */
                __ASM_EMIT("movaps %%xmm0, %%xmm2")             /* xmm2 = s[i] */
                __ASM_EMIT("mulps %%xmm7, %%xmm4")              /* xmm4 = s[i]*iA[i] */
                __ASM_EMIT("mulps %%xmm6, %%xmm2")              /* xmm2 = s[i]*rA[i] */
                __ASM_EMIT("subps %%xmm4, %%xmm3")              /* xmm3 = -s[i]*iA[i] */

                __ASM_EMIT("movups %%xmm0, 0x00(%[a])")
                __ASM_EMIT("movups %%xmm5, 0x10(%[a])")
                __ASM_EMIT("movups %%xmm2, 0x00(%[b])")
                __ASM_EMIT("movups %%xmm3, 0x10(%[b])")

                __ASM_EMIT("add $0x10, %[src]")
                __ASM_EMIT("add $0x20, %[a]")
                __ASM_EMIT("add $0x20, %[b]")

                /* Repeat loop */
                __ASM_EMIT32("subl $8, %[k]")
                __ASM_EMIT64("sub $8, %[k]")
                __ASM_EMIT("jz 2f")

                /* Rotate angle */
                __ASM_EMIT("movaps 0x00(%[wk]), %%xmm0")        /* xmm0 = rw[i] */
                __ASM_EMIT("movaps 0x10(%[wk]), %%xmm1")        /* xmm1 = iw[i] */
                __ASM_EMIT("movaps %%xmm0, %%xmm2")             /* xmm2 = rw[i] */
                __ASM_EMIT("movaps %%xmm1, %%xmm3")             /* xmm3 = iw[i] */
                __ASM_EMIT("mulps %%xmm6, %%xmm3")              /* xmm3 = rA[i] * iw[i] */
                __ASM_EMIT("mulps %%xmm7, %%xmm1")              /* xmm1 = iA[i] * iw[i] */
                __ASM_EMIT("mulps %%xmm0, %%xmm6")              /* xmm6 = rA[i] * rw[i] */
                __ASM_EMIT("mulps %%xmm2, %%xmm7")              /* xmm7 = iA[i] * rw[i] */
                __ASM_EMIT("subps %%xmm1, %%xmm6")              /* xmm6 = rA[i] * rw[i] - iA[i] * iw[i] */
                __ASM_EMIT("addps %%xmm3, %%xmm7")              /* xmm7 = rA[i] * iw[i] + iA[i] * rw[i] */
                __ASM_EMIT("jmp 1b")

                __ASM_EMIT("2:")

                : [a] "+r" (a), [b] "+r" (b), [src] "+r" (src), [k] "+r" (k)
                : [wk] "r" (wk)
                : "cc", "memory",
                "%xmm0", "%xmm1", "%xmm2", "%xmm3",
                "%xmm4", "%xmm5", "%xmm6", "%xmm7"
            );

            wk -= 8;
            ak -= 8;
            n >>= 1;
            bs >>= 1;
        }
        else
        {
            // Unpack 4x real to 4x split complex
            ARCH_X86_ASM
            (
                __ASM_EMIT("movups 0x00(%[src]), %%xmm0")       /* xmm0 = s[i] */
                __ASM_EMIT("xorps %%xmm1, %%xmm1")              /* xmm1 = 0 */
                __ASM_EMIT("movups %%xmm0, 0x00(%[dst])")
                __ASM_EMIT("movups %%xmm1, 0x10(%[dst])")
                :
                : [src] "r" (src), [dst] "r" (dst)
                : "cc", "memory",
                "%xmm0", "%xmm1"
            );
        }

        // Iterate butterflies
        for (; n > 4; n >>= 1, bs >>= 1)
        {
            for (size_t p=0; p<items; p += bs)
            {
                // Set initial values of pointers
                float *a    = &dst[p];
                float *b    = &a[n];
                size_t k    = n;

                ARCH_X86_ASM
                (
                    __ASM_EMIT("movaps 0x00(%[ak]), %%xmm6")    /* xmm6 = rA[i] */
                    __ASM_EMIT("movaps 0x10(%[ak]), %%xmm7")    /* xmm7 = iA[i] */
                    :
                    : [ak] "r" (ak)
                    : "%xmm6", "%xmm7"
                );

                ARCH_X86_ASM
                (
                    __ASM_EMIT("1:")

                    __ASM_EMIT("movups 0x00(%[a]), %%xmm0")     /* xmm0 = ra[i] */
                    __ASM_EMIT("movups 0x10(%[a]), %%xmm1")     /* xmm1 = ia[i] */
                    __ASM_EMIT("movups 0x00(%[b]), %%xmm2")     /* xmm2 = rb[i] */
                    __ASM_EMIT("movups 0x10(%[b]), %%xmm3")     /* xmm3 = ib[i] */

                    __ASM_EMIT("movaps %%xmm0, %%xmm4")         /* xmm4 = ra[i] */
                    __ASM_EMIT("movaps %%xmm1, %%xmm5")         /* xmm5 = ia[i] */
                    __ASM_EMIT("addps %%xmm2, %%xmm0")          /* xmm0 = ra[i]+rb[i] */
                    __ASM_EMIT("addps %%xmm3, %%xmm1")          /* xmm1 = ia[i]+ib[i] */
                    __ASM_EMIT("subps %%xmm2, %%xmm4")          /* xmm4 = ra[i]-rb[i] = rc[i] */
                    __ASM_EMIT("subps %%xmm3, %%xmm5")          /* xmm5 = ia[i]-ib[i] = ic[i] */
                    __ASM_EMIT("movaps %%xmm4, %%xmm2")         /* xmm2 = rc[i] */
                    __ASM_EMIT("movaps %%xmm5, %%xmm3")         /* xmm3 = ic[i] */
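                    /* Note (added comment, inferred from the annotations below): the next six
                       instructions multiply c[i] by the conjugated coefficient,
                       (rc + j*ic) * (rA - j*iA) */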
__ASM_EMIT("mulps %%xmm6, %%xmm4") /* xmm4 = rA[i]*rc[i] */ 408 __ASM_EMIT("mulps %%xmm7, %%xmm5") /* xmm5 = iA[i]*ic[i] */ 409 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = rA[i]*ic[i] */ 410 __ASM_EMIT("mulps %%xmm7, %%xmm2") /* xmm2 = iA[i]*rc[i] */ 411 __ASM_EMIT("addps %%xmm5, %%xmm4") /* xmm4 = rA[i]*rc[i] + iA[i]*ic[i] */ 412 __ASM_EMIT("subps %%xmm2, %%xmm3") /* xmm3 = rA[i]*ic[i] - iA[i]*rc[i] */ 413 414 __ASM_EMIT("movups %%xmm0, 0x00(%[a])") 415 __ASM_EMIT("movups %%xmm1, 0x10(%[a])") 416 __ASM_EMIT("movups %%xmm4, 0x00(%[b])") 417 __ASM_EMIT("movups %%xmm3, 0x10(%[b])") 418 419 /* Repeat loop */ 420 __ASM_EMIT("add $0x20, %[a]") 421 __ASM_EMIT("add $0x20, %[b]") 422 __ASM_EMIT("sub $8, %[k]") 423 __ASM_EMIT("jz 2f") 424 425 /* Rotate angle */ 426 __ASM_EMIT("movaps 0x00(%[wk]), %%xmm0") /* xmm0 = rw */ 427 __ASM_EMIT("movaps 0x10(%[wk]), %%xmm1") /* xmm1 = iw */ 428 __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = rw */ 429 __ASM_EMIT("movaps %%xmm1, %%xmm3") /* xmm3 = iw */ 430 __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = ra * iw */ 431 __ASM_EMIT("mulps %%xmm7, %%xmm1") /* xmm1 = ia * iw */ 432 __ASM_EMIT("mulps %%xmm0, %%xmm6") /* xmm6 = ra * rw */ 433 __ASM_EMIT("mulps %%xmm2, %%xmm7") /* xmm7 = ia * rw */ 434 __ASM_EMIT("subps %%xmm1, %%xmm6") /* xmm6 = ra * rw - ia * iw */ 435 __ASM_EMIT("addps %%xmm3, %%xmm7") /* xmm7 = ra * iw + ia * rw */ 436 __ASM_EMIT("jmp 1b") 437 438 __ASM_EMIT("2:") 439 440 : [a] "+r" (a), [b] "+r" (b), [k] "+r" (k) 441 : [wk] "r"(wk) 442 : "cc", "memory", 443 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 444 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 445 ); 446 } 447 448 ak -= 8; 449 wk -= 8; 450 } 451 } 452 } 453