//typedef struct  {
//    uint64_t n;
//    double* trig_tables;
//} IFFT_PRECOMP;

/*
 * Storage usage:
 *    Bytes  Location  Description
 *        8  rcx       Loop counter, temporary pointer
 *        8  rdx       Size of FFT (i.e. number of elements in the vector) (a power of 2) on entry; later a temporary pointer
 *        8  rdi       Base of real components array, constant (64-bit floats)
 *        8  rsi       Base of imaginary components array, constant (64-bit floats)
 *        8  r8        Base of trigonometric tables array (64-bit floats)
 *        8  r9        Loop counter (off)
 *        8  rax       Temporary, loop counter
 *        8  r10       n/4, constant after setup
 *        8  r11       Temporary, loop counter
 *        8  r12       Temporary, loop counter
 *        8  r13       Temporary (halfnn)
 *        8  r14       Temporary trig table pointer
 *        8  rbx       Temporary pointer
 *        8  rsp       x86-64 stack pointer
 *      352  ymm0-10   Temporary (64-bit float vectors)
 *      128  ymm12-15  Multiplication constants (64-bit float vectors)
 *        8  [rsp+ 0]  Caller's value of rbx
 *        8  [rsp+ 8]  Caller's value of r14
 *        8  [rsp+16]  Caller's value of r13
 *        8  [rsp+24]  Caller's value of r12
 *        8  [rsp+32]  Caller's value of r11
 *        8  [rsp+40]  Caller's value of r10
 */

/* void ifft(const void *tables, double *real) */
.globl ifft
ifft:
	/* Save registers */
	pushq       %r10
	pushq       %r11
	pushq       %r12
	pushq       %r13
	pushq       %r14
	pushq       %rbx

	/* Permute registers for better variable names */
	movq        %rdi, %rax      /* rax: base of the precomputed tables struct */
	movq        %rsi, %rdi      /* rdi: base of the real data */

        //IFFT_PRECOMP* fft_tables = (IFFT_PRECOMP*) tables;
        //const int32_t n = fft_tables->n;
        //const double* trig_tables = fft_tables->trig_tables;

	/* Load struct IFFT_PRECOMP fields */
	movq         0(%rax), %rdx  /* rdx: Size of FFT (a power of 2; the 4-wide loops below assume n >= 32) */
	movq         8(%rax), %r8   /* r8: Base address of trigonometric tables array */

        //int32_t ns4 = n/4;
        //double* are = real;      //size n/4 (x8 bytes because doubles)
        //double* aim = real+ns4;  //size n/4
	movq	%rdx, %r10
	shl	$1, %r10            /* r10 = 2*n bytes = n/4 doubles */
	add	%r10, %rsi          /* rsi: base of the imaginary data */
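	/*
	 * Layout: the buffer holds n/4 reals followed by n/4 imaginaries.
	 * n/4 doubles = (n/4)*8 = 2*n bytes, which is why r10 = n << 1 is
	 * added as a byte offset. E.g. for n = 64: aim = real + 16 doubles
	 * (128 bytes past the base).
	 */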
	//    //multiply by omega^j
	//    for (int32_t j=0; j<ns4; j+=4) {
	//	const double* r0 = trig_tables+2*j;
	//	const double* r1 = r0+4;
	//	//(re*cos-im*sin) + i (im*cos+re*sin)
	//	double* d0 = are+j;
	//	double* d1 = aim+j;
	//	dotp4(tmp0,d0,r0); //re*cos
	//	dotp4(tmp1,d1,r0); //im*cos
	//	dotp4(tmp2,d0,r1); //re*sin
	//	dotp4(tmp3,d1,r1); //im*sin
	//	sub4(d0,tmp0,tmp3);
	//	add4(d1,tmp1,tmp2);
	//    }
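	// This is the complex product (re + i*im) * (cos + i*sin), computed
	// 4 complex elements at a time:
	//   new re = re*cos - im*sin
	//   new im = im*cos + re*sin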

	shr	$3, %r10  /* now, r10 is n/4 (the loop bound) */
	movq    $0, %rcx  /* Loop counter: Range [0, r10), step size 4 */
	movq	%r8, %r11 /* r11 is the trig table pointer, step size 64 bytes */
firstloop:
	vmovupd (%rdi,%rcx,8), %ymm0 /* real */
	vmovupd (%rsi,%rcx,8), %ymm1 /* imag */
	vmovupd 0(%r11), %ymm2 /* cos */
	vmovupd 32(%r11), %ymm3 /* sin */
	vmulpd  %ymm0, %ymm2, %ymm4 /* re*cos */
	vmulpd  %ymm0, %ymm3, %ymm5 /* re*sin */
	vmulpd  %ymm1, %ymm2, %ymm6 /* im*cos */
	vmulpd  %ymm1, %ymm3, %ymm7 /* im*sin */
	vsubpd  %ymm7, %ymm4, %ymm0 /* re*cos - im*sin -> new re */
	vaddpd  %ymm5, %ymm6, %ymm1 /* im*cos + re*sin -> new im */
	vmovupd %ymm0, (%rdi,%rcx,8)
	vmovupd %ymm1, (%rsi,%rcx,8)
	/* next iteration */
	leaq    64(%r11), %r11
	addq	$4,%rcx
	cmpq	%r10,%rcx
	jb	firstloop
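
	/*
	 * Note on the table layout assumed by the loop above: each 64-byte
	 * step of the trig table holds 4 cosines followed by 4 sines (the
	 * two ymm loads at offsets 0 and 32), so this first stage consumes
	 * 2*(n/4) doubles in total -- matching the initial cur_tt += 2*nn
	 * advance in the next stage's reference code.
	 */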


/*
    const double* cur_tt = trig_tables;
    for (int32_t nn=ns4; nn>=8; nn/=2) {
	int32_t halfnn = nn/2;
	cur_tt += 2*nn;
	for (int32_t block=0; block<ns4; block+=nn) {
	    for (int32_t off=0; off<halfnn; off+=4) {
		double* d00 = are + block + off;
		double* d01 = aim + block + off;
		double* d10 = are + block + halfnn + off;
		double* d11 = aim + block + halfnn + off;
		add4(tmp0,d00,d10); // re + re
		add4(tmp1,d01,d11); // im + im
		sub4(tmp2,d00,d10); // re - re
		sub4(tmp3,d01,d11); // im - im
		copy4(d00,tmp0);
		copy4(d01,tmp1);
		const double* r0 = cur_tt+2*off;
		const double* r1 = r0+4;
		dotp4(tmp0,tmp2,r0); //re*cos
		dotp4(tmp1,tmp3,r1); //im*sin
		sub4(d10,tmp0,tmp1);
		dotp4(tmp0,tmp2,r1); //re*sin
		dotp4(tmp1,tmp3,r0); //im*cos
		add4(d11,tmp0,tmp1);
	    }
	}
    }
*/
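	/*
	 * Each iteration above is a decimation-in-frequency butterfly: with
	 * a = (d00,d01), b = (d10,d11) and twiddle w = cos + i*sin,
	 *   d0' = a + b
	 *   d1' = (a - b) * w
	 * applied to 4 complex elements at once.
	 */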
        /* r10 is still = n/4  (constant) */
        /* r8 is cur_tt (initially, base of trig table) */
	movq %r10,%r12 /* r12 (nn): outer loop counter from n/4 down to 8 */
nnloop:
	movq %r12,%r13
	shr  $1,%r13   /* r13 = halfnn */
	leaq (%r8,%r12,8),%r8
	leaq (%r8,%r12,8),%r8 /* update cur_tt += nn*16 bytes (2*nn doubles) */
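	/* two leaq steps: the addressing-mode scale tops out at 8, so nn*16 bytes is added as nn*8 twice */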
	movq $0,%r11   /* r11 (block) */
blockloop:
	leaq (%rdi,%r11,8),%rax /* are + block */
	leaq (%rsi,%r11,8),%rbx /* aim + block */
	leaq (%rax,%r13,8),%rcx /* are + block + halfnn */
	leaq (%rbx,%r13,8),%rdx /* aim + block + halfnn */
	movq $0,%r9   /* r9 (off) */
	movq %r8,%r14 /* r14 : cur_tt + 16*off */
offloop:
	vmovupd (%rax,%r9,8), %ymm0  /* d00 */
	vmovupd (%rbx,%r9,8), %ymm1  /* d01 */
	vmovupd (%rcx,%r9,8), %ymm2  /* d10 */
	vmovupd (%rdx,%r9,8), %ymm3  /* d11 */
	vaddpd %ymm0,%ymm2,%ymm4 /* tmp0 = re + re */
	vaddpd %ymm1,%ymm3,%ymm5 /* tmp1 = im + im */
	vsubpd %ymm2,%ymm0,%ymm6 /* tmp2 = re - re */
	vsubpd %ymm3,%ymm1,%ymm7 /* tmp3 = im - im */
	vmovupd %ymm4,(%rax,%r9,8)
	vmovupd %ymm5,(%rbx,%r9,8)
	vmovupd (%r14),%ymm8 /* r0 = cos */
	vmovupd 32(%r14),%ymm9 /* r1 = sin */
	vmulpd %ymm6,%ymm8,%ymm4 /* tmp2*cos */
	vmulpd %ymm7,%ymm9,%ymm5 /* tmp3*sin */
	vsubpd %ymm5,%ymm4,%ymm10 /* tmp2*cos - tmp3*sin -> new d10 */
	vmovupd %ymm10,(%rcx,%r9,8)
	vmulpd %ymm6,%ymm9,%ymm4 /* tmp2*sin */
	vmulpd %ymm7,%ymm8,%ymm5 /* tmp3*cos */
	vaddpd %ymm4,%ymm5,%ymm10 /* tmp2*sin + tmp3*cos -> new d11 */
	vmovupd %ymm10,(%rdx,%r9,8)
	/* end of off loop */
	leaq 64(%r14),%r14
	addq $4,%r9
	cmpq %r13,%r9
	jb offloop
	/* end of block loop */
	addq %r12,%r11
	cmpq %r10,%r11
	jb blockloop
	/* end of nn loop */
	movq %r13,%r12
	cmpq $8,%r12
	jae nnloop


/*
    //size 4 loop
    {
	for (int32_t block=0; block<ns4; block+=4) {
	    double* d0 = are+block;
	    double* d1 = aim+block;
	    tmp0[0]=d0[0];
	    tmp0[1]=d0[1];
	    tmp0[2]=d0[0];
	    tmp0[3]=-d1[1];
	    tmp1[0]=d0[2];
	    tmp1[1]=d0[3];
	    tmp1[2]=-d0[2];
	    tmp1[3]=d1[3];
	    tmp2[0]=d1[0];
	    tmp2[1]=d1[1];
	    tmp2[2]=d1[0];
	    tmp2[3]=d0[1];
	    tmp3[0]=d1[2];
	    tmp3[1]=d1[3];
	    tmp3[2]=-d1[2];
	    tmp3[3]=-d0[3];
	    add4(d0,tmp0,tmp1);
	    add4(d1,tmp2,tmp3);
	}
    }
*/
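	/*
	 * The vector code below builds tmp0..tmp3 without any scalar loads:
	 * vshufpd mixes real/imag qwords within each 128-bit lane,
	 * vperm2f128 then combines the lanes, and the sign patterns from the
	 * C code above are applied with one vmulpd per constant vector.
	 */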
    	/* r10 is still = n/4  (constant) */
	vmovapd     size4negation0(%rip), %ymm15
	vmovapd     size4negation1(%rip), %ymm14
	vmovapd     size4negation2(%rip), %ymm13
	vmovapd     size4negation3(%rip), %ymm12
	movq $0,%rax /* rax (block) */
	movq %rdi,%r11 /* r11 (are+block) */
	movq %rsi,%r12 /* r12 (aim+block) */
size4loop:
	vmovupd (%r11),%ymm0 /* r0 r1 r2 r3 */
	vmovupd (%r12),%ymm1 /* i0 i1 i2 i3 */
	vshufpd $10,%ymm1,%ymm0,%ymm2 /* r0 i1 r2 i3 */
	vshufpd $10,%ymm0,%ymm1,%ymm3 /* i0 r1 i2 r3 */
	vperm2f128 $32,%ymm2,%ymm0,%ymm4 /* r0 r1 r0 i1 */
	vperm2f128 $49,%ymm2,%ymm0,%ymm5 /* r2 r3 r2 i3 */
	vperm2f128 $32,%ymm3,%ymm1,%ymm6 /* i0 i1 i0 r1 */
	vperm2f128 $49,%ymm3,%ymm1,%ymm7 /* i2 i3 i2 r3 */
	vmulpd	%ymm4,%ymm15,%ymm4 /* r0 r1 r0 -i1 */
	vmulpd	%ymm5,%ymm14,%ymm5 /* r2 r3 -r2 i3 */
	vmulpd	%ymm7,%ymm13,%ymm7 /* i2 i3 -i2 -r3 */
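	/* ymm6 = i0 i1 i0 r1 needs no vmulpd: its sign pattern would be all +1.0 */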
	vaddpd  %ymm4,%ymm5,%ymm0 /* new re: r0+r2, r1+r3, r0-r2, -i1+i3 */
	vaddpd  %ymm6,%ymm7,%ymm1 /* new im: i0+i2, i1+i3, i0-i2, r1-r3 */
	vmovupd %ymm0,(%r11)
	vmovupd %ymm1,(%r12)
	/* end of loop */
	leaq 32(%r11),%r11
	leaq 32(%r12),%r12
	addq $4,%rax
	cmpq %r10,%rax
	jb size4loop


/*
    //size 2
    {
	for (int32_t block=0; block<ns4; block+=4) {
	    double* d0 = are+block;
	    double* d1 = aim+block;
	    tmp0[0]=d0[0];
	    tmp0[1]=d0[0];
	    tmp0[2]=d0[2];
	    tmp0[3]=d0[2];
	    tmp1[0]=d0[1];
	    tmp1[1]=-d0[1];
	    tmp1[2]=d0[3];
	    tmp1[3]=-d0[3];
	    add4(d0,tmp0,tmp1);
	    tmp0[0]=d1[0];
	    tmp0[1]=d1[0];
	    tmp0[2]=d1[2];
	    tmp0[3]=d1[2];
	    tmp1[0]=d1[1];
	    tmp1[1]=-d1[1];
	    tmp1[2]=d1[3];
	    tmp1[3]=-d1[3];
	    add4(d1,tmp0,tmp1);
	}
    }
}
*/
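	/*
	 * Final radix-2 stage: for each group of 4, compute
	 *   (x0+x1, x0-x1, x2+x3, x2-x3)
	 * separately on the real and imaginary parts, by pairing a
	 * duplicated-even vector with a sign-flipped duplicated-odd vector.
	 */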
	movq $0,%rax /* rax (block) */
	movq %rdi,%r11 /* r11 (are+block) */
	movq %rsi,%r12 /* r12 (aim+block) */
size2loop:
	vmovupd (%r11),%ymm0 /* r0 r1 r2 r3 */
	vmovupd (%r12),%ymm1 /* i0 i1 i2 i3 */
	vshufpd $0,%ymm0,%ymm0,%ymm2 /* r0 r0 r2 r2 */
	vshufpd $15,%ymm0,%ymm0,%ymm3 /* r1 r1 r3 r3 */
	vshufpd $0,%ymm1,%ymm1,%ymm4 /* i0 i0 i2 i2 */
	vshufpd $15,%ymm1,%ymm1,%ymm5 /* i1 i1 i3 i3 */
	vmulpd  %ymm3,%ymm12,%ymm3 /* r1 -r1 r3 -r3 */
	vmulpd  %ymm5,%ymm12,%ymm5 /* i1 -i1 i3 -i3 */
	vaddpd  %ymm2,%ymm3,%ymm0 /* r0+r1, r0-r1, r2+r3, r2-r3 */
	vaddpd  %ymm4,%ymm5,%ymm1 /* i0+i1, i0-i1, i2+i3, i2-i3 */
	vmovupd %ymm0,(%r11)
	vmovupd %ymm1,(%r12)
	/* end of loop */
	leaq 32(%r11),%r11
	leaq 32(%r12),%r12
	addq $4,%rax
	cmpq %r10,%rax
	jb size2loop


	/* Restore registers */
end:
	vzeroall
	popq        %rbx
	popq        %r14
	popq        %r13
	popq        %r12
	popq        %r11
	popq        %r10
	retq
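
/*
 * Minimal calling sketch from C, assuming the caller has already built the
 * trig tables in the layout described above (names are illustrative only):
 *
 *   extern void ifft(const void *tables, double *real);
 *
 *   IFFT_PRECOMP tab = { n, trig_tables };
 *   double data[N_OVER_2];   // n/4 reals followed by n/4 imaginaries
 *   ifft(&tab, data);
 */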


/* Constants for YMM */
.balign 32
size4negation0: .double +1.0, +1.0, +1.0, -1.0
size4negation1: .double +1.0, +1.0, -1.0, +1.0
size4negation2: .double +1.0, +1.0, -1.0, -1.0
size4negation3: .double +1.0, -1.0, +1.0, -1.0
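
/* size4negation0..2 (ymm15, ymm14, ymm13) serve the size-4 stage;
   size4negation3 (ymm12) is loaded there too but is only used by the size-2 stage. */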