//typedef struct {
//    uint64_t n;
//    double* trig_tables;
//} IFFT_PRECOMP;

/*
 * Storage usage:
 *    Bytes  Location   Description
 *        8  rcx        Loop counter
 *        8  rdx        Size of FFT (i.e. number of elements in the vector) (a power of 2), constant
 *        8  rdi        Base of real components array, constant (64-bit floats)
 *        8  rsi        Base of imaginary components array, constant (64-bit floats)
 *        8  r8         Base of trigonometric tables array (64-bit floats)
 *        8  r9         Loop counter
 *        8  rax        Temporary, loop counter
 *        8  rbx        Temporary
 *        8  r10        Temporary
 *        8  r11        Temporary
 *        8  r12        Temporary
 *        8  r13        Temporary
 *        8  r14        Temporary
 *        8  rsp        x86-64 stack pointer
 *      352  ymm0-10    Temporary (64-bit float vectors)
 *      128  ymm12-15   Multiplication constants (64-bit float vectors)
 *        8  [rsp+40]   Caller's value of r10
 *        8  [rsp+32]   Caller's value of r11
 *        8  [rsp+24]   Caller's value of r12
 *        8  [rsp+16]   Caller's value of r13
 *        8  [rsp+ 8]   Caller's value of r14
 *        8  [rsp+ 0]   Caller's value of rbx
 */

/* void ifft(const void *tables, double *real) */
.globl ifft
ifft:
    /* Save registers */
    pushq %r10
    pushq %r11
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %rbx

    /* Permute registers for better variable names */
    movq %rdi, %rax
    movq %rsi, %rdi            /* rdi: base of the real data */

    //IFFT_PRECOMP* fft_tables = (IFFT_PRECOMP*) tables;
    //const uint64_t n = fft_tables->n;
    //const double* trig_tables = fft_tables->trig_tables;

    /* Load struct IFFT_PRECOMP fields */
    movq 0(%rax), %rdx         /* rdx: size of the FFT (a power of 2; the loops below assume n/4 >= 8) */
    movq 8(%rax), %r8          /* r8: base address of the trigonometric tables array */

    //int32_t ns4 = n/4;
    //double* are = real;      //size n/4 (x8 bytes because doubles)
    //double* aim = real+ns4;  //size n/4
    movq %rdx, %r10
    shl $1, %r10               /* r10 = 2*n = byte offset of aim (n/4 doubles) */
    add %r10, %rsi             /* rsi: base of the imaginary data */

    //multiply by omega^j
    //for (int32_t j=0; j<ns4; j+=4) {
    //    const double* r0 = trig_tables+2*j;
    //    const double* r1 = r0+4;
    //    //(re*cos-im*sin) + i (im*cos+re*sin)
    //    double* d0 = are+j;
    //    double* d1 = aim+j;
    //    dotp4(tmp0,d0,r0); //re*cos
    //    dotp4(tmp1,d1,r0); //im*cos
    //    dotp4(tmp2,d0,r1); //re*sin
    //    dotp4(tmp3,d1,r1); //im*sin
    //    sub4(d0,tmp0,tmp3);
    //    add4(d1,tmp1,tmp2);
    //}

    shr $3, %r10               /* now r10 = n/4 (the loop bound) */
    movq $0, %rcx              /* rcx: loop counter, range [0, n/4), step 4 */
    movq %r8, %r11             /* r11: trig table pointer, step 64 bytes */
firstloop:
    vmovupd (%rdi,%rcx,8), %ymm0    /* real */
    vmovupd (%rsi,%rcx,8), %ymm1    /* imag */
    vmovupd 0(%r11), %ymm2          /* cos */
    vmovupd 32(%r11), %ymm3         /* sin */
    vmulpd %ymm0, %ymm2, %ymm4      /* re*cos */
    vmulpd %ymm0, %ymm3, %ymm5      /* re*sin */
    vmulpd %ymm1, %ymm2, %ymm6      /* im*cos */
    vmulpd %ymm1, %ymm3, %ymm7      /* im*sin */
    vsubpd %ymm7, %ymm4, %ymm0      /* re*cos - im*sin -> new re */
    vaddpd %ymm5, %ymm6, %ymm1      /* im*cos + re*sin -> new im */
    vmovupd %ymm0, (%rdi,%rcx,8)
    vmovupd %ymm1, (%rsi,%rcx,8)
    /* next iteration */
    leaq 64(%r11), %r11
    addq $4, %rcx
    cmpq %r10, %rcx
    jb firstloop
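
/*
 * The reference code in the comments uses dotp4/add4/sub4/copy4 helpers that
 * are never defined in this file. A minimal sketch of their intended
 * element-wise semantics (an assumption, but the only reading consistent
 * with the vmulpd/vaddpd/vsubpd instructions they map to):
 *
 * static inline void dotp4(double* r, const double* a, const double* b) { for (int i=0; i<4; i++) r[i] = a[i]*b[i]; }
 * static inline void add4 (double* r, const double* a, const double* b) { for (int i=0; i<4; i++) r[i] = a[i]+b[i]; }
 * static inline void sub4 (double* r, const double* a, const double* b) { for (int i=0; i<4; i++) r[i] = a[i]-b[i]; }
 * static inline void copy4(double* r, const double* a)                  { for (int i=0; i<4; i++) r[i] = a[i]; }
 */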
/*
    const double* cur_tt = trig_tables;
    for (int32_t nn=ns4; nn>=8; nn/=2) {
        int32_t halfnn = nn/2;
        cur_tt += 2*nn;
        for (int32_t block=0; block<ns4; block+=nn) {
            for (int32_t off=0; off<halfnn; off+=4) {
                double* d00 = are + block + off;
                double* d01 = aim + block + off;
                double* d10 = are + block + halfnn + off;
                double* d11 = aim + block + halfnn + off;
                add4(tmp0,d00,d10); // re + re
                add4(tmp1,d01,d11); // im + im
                sub4(tmp2,d00,d10); // re - re
                sub4(tmp3,d01,d11); // im - im
                copy4(d00,tmp0);
                copy4(d01,tmp1);
                const double* r0 = cur_tt+2*off;
                const double* r1 = r0+4;
                dotp4(tmp0,tmp2,r0); //re*cos
                dotp4(tmp1,tmp3,r1); //im*sin
                sub4(d10,tmp0,tmp1);
                dotp4(tmp0,tmp2,r1); //re*sin
                dotp4(tmp1,tmp3,r0); //im*cos
                add4(d11,tmp0,tmp1);
            }
        }
    }
*/
    /* r10 is still n/4 (constant) */
    /* r8 is cur_tt (initially, the base of the trig tables) */
    movq %r10, %r12            /* r12 (nn): outer loop counter, from n/4 down to 8 */
nnloop:
    movq %r12, %r13
    shr $1, %r13               /* r13 = halfnn */
    leaq (%r8,%r12,8), %r8
    leaq (%r8,%r12,8), %r8     /* cur_tt += 2*nn doubles (16*nn bytes) */
    movq $0, %r11              /* r11 (block) */
blockloop:
    leaq (%rdi,%r11,8), %rax   /* rax = are + block */
    leaq (%rsi,%r11,8), %rbx   /* rbx = aim + block */
    leaq (%rax,%r13,8), %rcx   /* rcx = are + block + halfnn */
    leaq (%rbx,%r13,8), %rdx   /* rdx = aim + block + halfnn */
    movq $0, %r9               /* r9 (off) */
    movq %r8, %r14             /* r14 = cur_tt + 16*off bytes */
offloop:
    vmovupd (%rax,%r9,8), %ymm0     /* d00 */
    vmovupd (%rbx,%r9,8), %ymm1     /* d01 */
    vmovupd (%rcx,%r9,8), %ymm2     /* d10 */
    vmovupd (%rdx,%r9,8), %ymm3     /* d11 */
    vaddpd %ymm0, %ymm2, %ymm4      /* tmp0 = re + re */
    vaddpd %ymm1, %ymm3, %ymm5      /* tmp1 = im + im */
    vsubpd %ymm2, %ymm0, %ymm6      /* tmp2 = re - re */
    vsubpd %ymm3, %ymm1, %ymm7      /* tmp3 = im - im */
    vmovupd %ymm4, (%rax,%r9,8)     /* d00 = tmp0 */
    vmovupd %ymm5, (%rbx,%r9,8)     /* d01 = tmp1 */
    vmovupd (%r14), %ymm8           /* r0 = cos */
    vmovupd 32(%r14), %ymm9         /* r1 = sin */
    vmulpd %ymm6, %ymm8, %ymm4      /* re*cos */
    vmulpd %ymm7, %ymm9, %ymm5      /* im*sin */
    vsubpd %ymm5, %ymm4, %ymm10
    vmovupd %ymm10, (%rcx,%r9,8)    /* d10 = re*cos - im*sin */
    vmulpd %ymm6, %ymm9, %ymm4      /* re*sin */
    vmulpd %ymm7, %ymm8, %ymm5      /* im*cos */
    vaddpd %ymm4, %ymm5, %ymm10
    vmovupd %ymm10, (%rdx,%r9,8)    /* d11 = re*sin + im*cos */
    /* end of off loop */
    leaq 64(%r14), %r14
    addq $4, %r9
    cmpq %r13, %r9
    jb offloop
    /* end of block loop */
    addq %r12, %r11
    cmpq %r10, %r11
    jb blockloop
    /* end of nn loop */
    movq %r13, %r12
    cmpq $8, %r12
    jae nnloop

/*
    //size 4 loop
    {
        for (int32_t block=0; block<ns4; block+=4) {
            double* d0 = are+block;
            double* d1 = aim+block;
            tmp0[0]=d0[0];
            tmp0[1]=d0[1];
            tmp0[2]=d0[0];
            tmp0[3]=-d1[1];
            tmp1[0]=d0[2];
            tmp1[1]=d0[3];
            tmp1[2]=-d0[2];
            tmp1[3]=d1[3];
            tmp2[0]=d1[0];
            tmp2[1]=d1[1];
            tmp2[2]=d1[0];
            tmp2[3]=d0[1];
            tmp3[0]=d1[2];
            tmp3[1]=d1[3];
            tmp3[2]=-d1[2];
            tmp3[3]=-d0[3];
            add4(d0,tmp0,tmp1);
            add4(d1,tmp2,tmp3);
        }
    }
*/
    /* r10 is still n/4 (constant) */
    vmovapd size4negation0(%rip), %ymm15
    vmovapd size4negation1(%rip), %ymm14
    vmovapd size4negation2(%rip), %ymm13
    vmovapd size4negation3(%rip), %ymm12
    movq $0, %rax              /* rax (block) */
    movq %rdi, %r11            /* r11 (are+block) */
    movq %rsi, %r12            /* r12 (aim+block) */
size4loop:
    vmovupd (%r11), %ymm0                /* r0 r1 r2 r3 */
    vmovupd (%r12), %ymm1                /* i0 i1 i2 i3 */
    vshufpd $10, %ymm1, %ymm0, %ymm2     /* r0 i1 r2 i3 */
    vshufpd $10, %ymm0, %ymm1, %ymm3     /* i0 r1 i2 r3 */
    vperm2f128 $32, %ymm2, %ymm0, %ymm4  /* r0 r1 r0 i1 */
    vperm2f128 $49, %ymm2, %ymm0, %ymm5  /* r2 r3 r2 i3 */
    vperm2f128 $32, %ymm3, %ymm1, %ymm6  /* i0 i1 i0 r1 */
    vperm2f128 $49, %ymm3, %ymm1, %ymm7  /* i2 i3 i2 r3 */
    vmulpd %ymm4, %ymm15, %ymm4          /* r0 r1 r0 -i1 */
    vmulpd %ymm5, %ymm14, %ymm5          /* r2 r3 -r2 i3 */
    vmulpd %ymm7, %ymm13, %ymm7          /* i2 i3 -i2 -r3 */
    vaddpd %ymm4, %ymm5, %ymm0
    vaddpd %ymm6, %ymm7, %ymm1
    vmovupd %ymm0, (%r11)
    vmovupd %ymm1, (%r12)
    /* end of loop */
    leaq 32(%r11), %r11
    leaq 32(%r12), %r12
    addq $4, %rax
    cmpq %r10, %rax
    jb size4loop
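
/*
 * Worked check of the size-4 stage above (my reading of the reference code;
 * write the k-th complex entry of a block as a_k = r_k + i*i_k):
 *   d0[0] + i*d1[0] = a0 + a2                              (radix-2 sum)
 *   d0[1] + i*d1[1] = a1 + a3                              (radix-2 sum)
 *   d0[2] + i*d1[2] = a0 - a2                              (difference, twiddle 1)
 *   d0[3] + i*d1[3] = (i3-i1) + i*(r1-r3) = i*(a1 - a3)    (difference rotated by +i)
 * So each block of 4 performs two radix-2 butterflies, with the odd difference
 * multiplied by +i, as a size-4 inverse-transform stage requires; no table
 * lookups are needed because the only twiddles are 1 and i.
 */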
/*
    //size 2
    {
        for (int32_t block=0; block<ns4; block+=4) {
            double* d0 = are+block;
            double* d1 = aim+block;
            tmp0[0]=d0[0];
            tmp0[1]=d0[0];
            tmp0[2]=d0[2];
            tmp0[3]=d0[2];
            tmp1[0]=d0[1];
            tmp1[1]=-d0[1];
            tmp1[2]=d0[3];
            tmp1[3]=-d0[3];
            add4(d0,tmp0,tmp1);
            tmp0[0]=d1[0];
            tmp0[1]=d1[0];
            tmp0[2]=d1[2];
            tmp0[3]=d1[2];
            tmp1[0]=d1[1];
            tmp1[1]=-d1[1];
            tmp1[2]=d1[3];
            tmp1[3]=-d1[3];
            add4(d1,tmp0,tmp1);
        }
    }
*/
    /* ymm12 still holds +1 -1 +1 -1 from the size-4 stage */
    movq $0, %rax              /* rax (block) */
    movq %rdi, %r11            /* r11 (are+block) */
    movq %rsi, %r12            /* r12 (aim+block) */
size2loop:
    vmovupd (%r11), %ymm0                /* r0 r1 r2 r3 */
    vmovupd (%r12), %ymm1                /* i0 i1 i2 i3 */
    vshufpd $0, %ymm0, %ymm0, %ymm2      /* r0 r0 r2 r2 */
    vshufpd $15, %ymm0, %ymm0, %ymm3     /* r1 r1 r3 r3 */
    vshufpd $0, %ymm1, %ymm1, %ymm4      /* i0 i0 i2 i2 */
    vshufpd $15, %ymm1, %ymm1, %ymm5     /* i1 i1 i3 i3 */
    vmulpd %ymm3, %ymm12, %ymm3          /* r1 -r1 r3 -r3 */
    vmulpd %ymm5, %ymm12, %ymm5          /* i1 -i1 i3 -i3 */
    vaddpd %ymm2, %ymm3, %ymm0           /* r0+r1 r0-r1 r2+r3 r2-r3 */
    vaddpd %ymm4, %ymm5, %ymm1           /* i0+i1 i0-i1 i2+i3 i2-i3 */
    vmovupd %ymm0, (%r11)
    vmovupd %ymm1, (%r12)
    /* end of loop */
    leaq 32(%r11), %r11
    leaq 32(%r12), %r12
    addq $4, %rax
    cmpq %r10, %rax
    jb size2loop

    /* Restore registers */
end:
    vzeroall
    popq %rbx
    popq %r14
    popq %r13
    popq %r12
    popq %r11
    popq %r10
    retq


/* Constants for YMM */
.balign 32
size4negation0: .double +1.0, +1.0, +1.0, -1.0
size4negation1: .double +1.0, +1.0, -1.0, +1.0
size4negation2: .double +1.0, +1.0, -1.0, -1.0
size4negation3: .double +1.0, -1.0, +1.0, -1.0
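
/*
 * Calling-convention sketch (System V AMD64: rdi = tables, rsi = real).
 * A minimal, hypothetical C caller, assuming the data layout used above:
 * the buffer holds n/4 real coefficients immediately followed by the n/4
 * matching imaginary coefficients (aim = are + n/4). The helper name
 * `apply_ifft` is illustrative, not part of this file:
 *
 * #include <stdint.h>
 *
 * typedef struct {
 *     uint64_t n;            // transform size: a power of 2 with n/4 >= 8
 *     double* trig_tables;   // precomputed cos/sin tables, laid out as described above
 * } IFFT_PRECOMP;
 *
 * extern void ifft(const void* tables, double* real);
 *
 * static void apply_ifft(const IFFT_PRECOMP* tab, double* data) {
 *     // data[0 .. n/4)   : real parts
 *     // data[n/4 .. n/2) : imaginary parts
 *     ifft(tab, data);
 * }
 */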