1; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
2
3; GOGO-no-coda
4;	Copyright (C) 1999 shigeo
5;	special thanks to URURI
6
7%include "nasm.h"
8
9	externdef costab_fft
10	externdef sintab_fft
11
12	segment_data
13	align 32
; Single-precision (dd) constants.
;   D_1_41421 : approximation of sqrt(2); scales gi[k2] and gi[k3] in the
;               untwiddled .do2 butterflies of both procedures below.
;   D_1_0     : initial value of the twiddle cosine c1 for each pass.
;   D_0_0     : initial value of the twiddle sine s1 for each pass.
;   D_0_5, D_0_25, D_0_0005 : not referenced by the code visible in this file.
14D_1_41421	dd	1.41421356
15D_1_0	dd	1.0
16D_0_5	dd	0.5
17D_0_25	dd	0.25
18D_0_0005	dd	0.0005
19D_0_0	dd	0.0
20
21	segment_code
22
23;void fht_FPU(float *fz, int n);
;
; In-place butterfly passes over the single-precision array fz, written with
; plain x87 instructions (no fxch scheduling; see fht_FPU_FXCH for that form).
;
; Arguments (stack, declared through the nasm.h `arg` macro):
;   fz : pointer to the float data; read and overwritten in place.
;   n  : compared against k1*fsize at the end of every pass to decide whether
;        another pass is needed.
;        NOTE(review): the compare is against a byte-scaled stride - confirm
;        the units callers are expected to pass.
;
; Structure of one pass (label .do):
;   .do2 : untwiddled butterflies on fi = fz and gi = fz + kx, stepping both
;          pointers by 4*k1 floats.
;   .for : for i = 1 .. kx-1, rotate the twiddle (c1, s1) by the per-pass
;          step (t_c, t_s) = (costab_fft[k], sintab_fft[k]), derive the
;          double-angle pair (c2, s2), then run
;   .do3 : twiddled butterflies on fi = fz + i and gi = fz + k1 - i.
;   After the pass: k += 2 and k1, k3, kx are multiplied by 4.
;
; Register roles (r0..r6 are aliases from nasm.h, not shown here):
;   r0 = fi (also a scratch copy of k before .for)   r1 = gi
;   r2 = k3*fsize   r3 = k1*fsize   r4 = kx*fsize   r5 = i*fsize
;   r6 = upper bound for fi in .do2 / .do3
;
; NOTE(review): r6 is never written anywhere in this procedure as shown, and
; the 1st/2nd part labels below are empty. Presumably the code that set up
; the end-of-data pointer was lost in the back port - confirm before use.
;
; Every section pops exactly what it pushed on the x87 stack.
24proc	fht_FPU
25
26%$fz	arg	4
27%$n	arg	4
28
; k: index into costab_fft / sintab_fft for the current pass
29%$k	local	4
30
; spill slots for the intermediate butterfly values used in .do3
31%$f0	local	4
32%$f1	local	4
33%$f2	local	4
34%$f3	local	4
35
36%$g0	local	4
37%$g1	local	4
38%$g2	local	4
39%$g3	local	4
40
; (c1, s1): current twiddle; (c2, s2): its double-angle values
41%$s1	local	4
42%$c1	local	4
43%$s2	local	4
44%$c2	local	4
45
; (t_c, t_s): per-pass rotation step loaded from the tables
46%$t_s	local	4
47%$t_c	local	4
48	alloc
49
50	pushd	ebp, ebx, esi, edi
51
; The three part labels all resolve to the same address; parts 1 and 2
; contain no instructions in this file.
52fht_FPU_1st_part:
53
54fht_FPU_2nd_part:
55
56fht_FPU_3rd_part:
57
58.do_init:
59	mov	r3, 16		;k1*fsize = 4*fsize = k4
60	mov	r4, 8		;kx = k1/2
61	mov	r2, 48		;k3*fsize
62	mov	dword [sp(%$k)], 2	;k = 2
63	mov	r0, [sp(%$fz)]	;fi
64	lea	r1, [r0+8]		;gi = fi + kx
65
; .do  : top of one pass (re-entered from the bottom with larger strides)
; .do2 : untwiddled butterflies on fi[0], fi[k1], fi[k2], fi[k3] and on gi[...]
66.do:
67.do2:
68	;f
69	fld	dword [r0]
70	fsub	dword [r0+r3]		;f1 = fi[0] - fi[k1]
71
72	fld	dword [r0]
73	fadd	dword [r0+r3]		;f0 = fi[0] + fi[k1]
74
75	fld	dword [r0+r3*2]
76	fsub	dword [r0+r2]		;f3 = fi[k2] - fi[k3]
77
78	fld	dword [r0+r3*2]
79	fadd	dword [r0+r2]		;f2 f3 f0 f1
80
81	fld	st2			;f0 f2 f3 f0 f1
82	fadd	st0, st1
83	fstp	dword [r0]		;fi[0] = f0 + f2
84
85	fld	st3			;f1 f2 f3 f0 f1
86	fadd	st0, st2
87	fstp	dword [r0+r3]		;fi[k1] = f1 + f3
88
89	fsubr	st0, st2		;f0-f2 f3 f0 f1
90	fstp	dword [r0+r3*2]		;fi[k2]
91
92	fsubr	st0, st2		;f1-f3 f0 f1
93	fstp	dword [r0+r2]		;fi[k3]
94	fcompp				;drop the remaining f0 f1 (compare result unused)
95
96	;g
97	fld	dword [r1]
98	fsub	dword [r1+r3]		;g1 = gi[0] - gi[k1]
99
100	fld	dword [r1]
101	fadd	dword [r1+r3]		;g0 = gi[0] + gi[k1]
102
103	fld	dword [D_1_41421]
104	fmul	dword [r1+r2]		;g3 = D_1_41421 * gi[k3]
105
106	fld	dword [D_1_41421]
107	fmul	dword [r1+r3*2]		;g2 g3 g0 g1
108
109	fld	st2			;g0 g2 g3 g0 g1
110	fadd	st0, st1
111	fstp	dword [r1]		;gi[0] = g0 + g2
112
113	fld	st3			;g1 g2 g3 g0 g1
114	fadd	st0, st2
115	fstp	dword [r1+r3]		;gi[k1] = g1 + g3
116
117	fsubr	st0, st2		;g0-g2 g3 g0 g1
118	fstp	dword [r1+r3*2]		;gi[k2]
119
120	fsubr	st0, st2		;g1-g3 g0 g1
121	fstp	dword [r1+r2]		;gi[k3]
122	fcompp				;drop the remaining g0 g1
123
124	lea	r0, [r0+r3*4]		;fi += 4*k1 elements
125	lea	r1, [r1+r3*4]		;gi += 4*k1 elements
126	cmp	r0, r6			;NOTE(review): r6 is not set in this procedure
127	jb	.do2			;unsigned: loop while fi < r6
128
129
; Load the rotation step for this pass and reset the twiddle to (c1, s1) = (1, 0).
130	mov	r0, [sp(%$k)]
131	fld	dword [costab_fft +r0*4]
132	fstp	dword [sp(%$t_c)]	;t_c = costab_fft[k]
133	fld	dword [sintab_fft +r0*4]
134	fstp	dword [sp(%$t_s)]	;t_s = sintab_fft[k]
135	fld	dword [D_1_0]
136	fstp	dword [sp(%$c1)]	;c1 = 1.0
137	fld	dword [D_0_0]
138	fstp	dword [sp(%$s1)]	;s1 = 0.0
139
140.for_init:
141	mov	r5, 4		;i = 1*fsize
142
; .for : one iteration per i; the body runs before the i < kx test at the bottom.
143.for:
144	fld	dword [sp(%$c1)]
145	fmul	dword [sp(%$t_c)]
146	fld	dword [sp(%$s1)]
147	fmul	dword [sp(%$t_s)]
148	fsubp	st1, st0		;c1 = c1*t_c - s1*t_s
149
150	fld	dword [sp(%$c1)]
151	fmul	dword [sp(%$t_s)]
152	fld	dword [sp(%$s1)]
153	fmul	dword [sp(%$t_c)]
154	faddp	st1, st0		;s1 = c1*t_s + s1*t_c ; stack: s1 c1
155
156	fld	st1
157	fmul	st0, st0		;c1c1 s1 c1
158	fld	st1
159	fmul	st0, st0		;s1s1 c1c1 s1 c1
160	fsubp	st1, st0		;c2 = c1c1 - s1s1 ; stack: c2 s1 c1
161	fstp	dword [sp(%$c2)]	;s1 c1
162
163	fld	st1			;c1 s1 c1
164	fmul	st0, st1		;c1s1 s1 c1
165	fadd	st0, st0		;s2 = 2*c1s1 ; stack: s2 s1 c1
166	fstp	dword [sp(%$s2)]	;s1 c1
167
168	fstp	dword [sp(%$s1)]	;c1
169	fstp	dword [sp(%$c1)]	;
170
171	mov	r0, [sp(%$fz)]
172	add	r0, r5		;r0 = fi = fz + i
173	mov	r1, [sp(%$fz)]
174	add	r1, r3
175	sub	r1, r5		;r1 = gi = fz + k1 - i
176
; .do3 : twiddled butterflies. The first two sections rotate (fi[k1], gi[k1])
; and (fi[k3], gi[k3]) by (c2, s2) and spill f0..f3 / g0..g3; the last two
; rotate by (c1, s1) and write the eight results back.
177.do3:
178	fld	dword [sp(%$s2)]
179	fmul	dword [r0+r3]
180	fld	dword [sp(%$c2)]
181	fmul	dword [r1+r3]
182	fsubp	st1, st0		;b = s2*fi[k1] - c2*gi[k1]
183
184	fld	dword [sp(%$c2)]
185	fmul	dword [r0+r3]
186	fld	dword [sp(%$s2)]
187	fmul	dword [r1+r3]
188	faddp	st1, st0		;a = c2*fi[k1] + s2*gi[k1]  b
189
190	fld	dword [r0]
191	fsub	st0, st1		;f1 = fi[0] - a ; stack: f1 a b
192	fstp	dword [sp(%$f1)]	;a b
193
194	fadd	dword [r0]		;f0 = fi[0] + a ; stack: f0 b
195	fstp	dword [sp(%$f0)]	;b
196
197	fld	dword [r1]
198	fsub	st0, st1		;g1 = gi[0] - b ; stack: g1 b
199	fstp	dword [sp(%$g1)]	;b
200
201	fadd	dword [r1]		;g0 = gi[0] + b
202	fstp	dword [sp(%$g0)]	;
203
204
205	fld	dword [sp(%$s2)]
206	fmul	dword [r0+r2]
207	fld	dword [sp(%$c2)]
208	fmul	dword [r1+r2]
209	fsubp	st1, st0		;b = s2*fi[k3] - c2*gi[k3]
210
211	fld	dword [sp(%$c2)]
212	fmul	dword [r0+r2]
213	fld	dword [sp(%$s2)]
214	fmul	dword [r1+r2]
215	faddp	st1, st0		;a = c2*fi[k3] + s2*gi[k3]  b
216
217	fld	dword [r0+r3*2]
218	fsub	st0, st1		;f3 = fi[k2] - a ; stack: f3 a b
219	fstp	dword [sp(%$f3)]	;a b
220
221	fadd	dword [r0+r3*2]	;f2 = fi[k2] + a ; stack: f2 b
222	fstp	dword [sp(%$f2)]	;b
223
224	fld	dword [r1+r3*2]
225	fsub	st0, st1		;g3 = gi[k2] - b ; stack: g3 b
226	fstp	dword [sp(%$g3)]	;b
227
228	fadd	dword [r1+r3*2]	;g2 = gi[k2] + b
229	fstp	dword [sp(%$g2)]	;
230
231
232	fld	dword [sp(%$s1)]
233	fmul	dword [sp(%$f2)]
234	fld	dword [sp(%$c1)]
235	fmul	dword [sp(%$g3)]
236	fsubp	st1, st0		;b = s1*f2 - c1*g3
237
238	fld	dword [sp(%$c1)]
239	fmul	dword [sp(%$f2)]
240	fld	dword [sp(%$s1)]
241	fmul	dword [sp(%$g3)]
242	faddp	st1, st0		;a = c1*f2 + s1*g3  b
243
244	fld	dword [sp(%$f0)]
245	fsub	st0, st1		;fi[k2] = f0 - a ; stack: fi[k2] a b
246	fstp	dword [r0+r3*2]
247
248	fadd	dword [sp(%$f0)]	;fi[0] = f0 + a ; stack: fi[0] b
249	fstp	dword [r0]
250
251	fld	dword [sp(%$g1)]
252	fsub	st0, st1		;gi[k3] = g1 - b ; stack: gi[k3] b
253	fstp	dword [r1+r2]
254
255	fadd	dword [sp(%$g1)]	;gi[k1] = g1 + b
256	fstp	dword [r1+r3]
257
258
259	fld	dword [sp(%$c1)]
260	fmul	dword [sp(%$g2)]
261	fld	dword [sp(%$s1)]
262	fmul	dword [sp(%$f3)]
263	fsubp	st1, st0		;b = c1*g2 - s1*f3
264
265	fld	dword [sp(%$s1)]
266	fmul	dword [sp(%$g2)]
267	fld	dword [sp(%$c1)]
268	fmul	dword [sp(%$f3)]
269	faddp	st1, st0		;a = s1*g2 + c1*f3  b
270
271	fld	dword [sp(%$g0)]
272	fsub	st0, st1		;gi[k2] = g0 - a ; stack: gi[k2] a b
273	fstp	dword [r1+r3*2]
274
275	fadd	dword [sp(%$g0)]	;gi[0] = g0 + a ; stack: gi[0] b
276	fstp	dword [r1]
277
278	fld	dword [sp(%$f1)]
279	fsub	st0, st1		;fi[k3] = f1 - b ; stack: fi[k3] b
280	fstp	dword [r0+r2]
281
282	fadd	dword [sp(%$f1)]	;fi[k1] = f1 + b
283	fstp	dword [r0+r3]
284
285
286	lea	r0, [r0+r3*4]		;fi += 4*k1 elements
287	lea	r1, [r1+r3*4]		;gi += 4*k1 elements
288	cmp	r0, r6			;NOTE(review): r6 is not set in this procedure
289	jb near	.do3			;unsigned: loop while fi < r6
290
291	add	r5, 4			;i++
292	cmp	r5, r4
293	jb near	.for			;loop while i < kx
294
295	cmp	r3, [sp(%$n)]		;finished once k1*fsize >= n (unsigned)
296	jae	.exit
297
298	add	dword [sp(%$k)], 2	;k  += 2;
299	lea	r3, [r3*4]		;k1 *= 4
300	lea	r2, [r2*4]		;k3 *= 4
301	lea	r4, [r4*4]		;kx *= 4
302	mov	r0, [sp(%$fz)]	;fi
303	lea	r1, [r0+r4]		;gi = fi + kx
304	jmp	.do
305
306.exit:
307	popd	ebp, ebx, esi, edi
308endproc
309
310;*************************************************************
311
312;void fht_FPU_FXCH(float *fz, int n);
;
; Same in-place butterfly passes as fht_FPU, with the x87 code re-ordered
; around fxch: the independent products / sums of each section are issued
; back to back and the results are exchanged into store order at the end.
;
; Arguments (stack, declared through the nasm.h `arg` macro):
;   fz : pointer to the float data; read and overwritten in place.
;   n  : compared against k1*fsize at the end of every pass to decide whether
;        another pass is needed.
;        NOTE(review): the compare is against a byte-scaled stride - confirm
;        the units callers are expected to pass.
;
; Register roles (r0..r6 are aliases from nasm.h, not shown here):
;   r0 = fi (also a scratch copy of k before .for)   r1 = gi
;   r2 = k3*fsize   r3 = k1*fsize   r4 = kx*fsize   r5 = i*fsize
;   r6 = upper bound for fi in .do2 / .do3
;
; NOTE(review): r6 is never written anywhere in this procedure as shown, and
; the 1st/2nd part labels below are empty. Presumably the code that set up
; the end-of-data pointer was lost in the back port - confirm before use.
;
; Stack comments list x87 registers from st0 on the left.
313proc	fht_FPU_FXCH
314
315%$fz	arg	4
316%$n	arg	4
317
; k: index into costab_fft / sintab_fft for the current pass
318%$k	local	4
319
; spill slots for the intermediate butterfly values used in .do3
320%$f0	local	4
321%$f1	local	4
322%$f2	local	4
323%$f3	local	4
324
325%$g0	local	4
326%$g1	local	4
327%$g2	local	4
328%$g3	local	4
329
; (c1, s1): current twiddle; (c2, s2): its double-angle values
330%$s1	local	4
331%$c1	local	4
332%$s2	local	4
333%$c2	local	4
334
; (t_c, t_s): per-pass rotation step loaded from the tables
335%$t_s	local	4
336%$t_c	local	4
337	alloc
338
339	pushd	ebp, ebx, esi, edi
340
; The three part labels all resolve to the same address; parts 1 and 2
; contain no instructions in this file.
341fht_FPU_FXCH_1st_part:
342
343fht_FPU_FXCH_2nd_part:
344
345fht_FPU_FXCH_3rd_part:
346
347.do_init:
348	mov	r3, 16		;k1*fsize = 4*fsize = k4
349	mov	r4, 8		;kx = k1/2
350	mov	r2, 48		;k3*fsize
351	mov	dword [sp(%$k)], 2	;k = 2
352	mov	r0, [sp(%$fz)]	;fi
353	lea	r1, [r0+8]		;gi = fi + kx
354
; .do  : top of one pass (re-entered from the bottom with larger strides)
; .do2 : untwiddled butterflies on fi[0], fi[k1], fi[k2], fi[k3] and on gi[...]
355.do:
356.do2:
357	;f
358	fld	dword [r0]
359	fsub	dword [r0+r3]		;f1 = fi[0] - fi[k1]
360	fld	dword [r0]
361	fadd	dword [r0+r3]		;f0 = fi[0] + fi[k1]
362
363	fld	dword [r0+r3*2]
364	fsub	dword [r0+r2]		;f3 = fi[k2] - fi[k3]
365	fld	dword [r0+r3*2]
366	fadd	dword [r0+r2]		;f2 f3 f0 f1
367
368	fld	st3			;f1 f2 f3 f0 f1
369	fld	st3			;f0 f1 f2 f3 f0 f1
370	fxch	st5			;f1 f1 f2 f3 f0 f0
371	fadd	st0, st3		;f1+f3 f1 f2 f3 f0 f0
372	fxch	st4			;f0 f1 f2 f3 f1+f3 f0
373	fadd	st0, st2		;f0+f2 f1 f2 f3 f1+f3 f0
374	fxch	st3			;f3 f1 f2 f0+f2 f1+f3 f0
375	fsubp	st1, st0		;f1-f3 f2 f0+f2 f1+f3 f0
376	fxch	st1			;f2 f1-f3 f0+f2 f1+f3 f0
377	fsubp	st4, st0		;f1-f3 f0+f2 f1+f3 f0-f2
378	fxch	st2			;f1+f3 f0+f2 f1-f3 f0-f2
379
380	fstp	dword [r0+r3]		;fi[k1]
381	fstp	dword [r0]		;fi[0]
382	fstp	dword [r0+r2]		;fi[k3]
383	fstp	dword [r0+r3*2]		;fi[k2]
384
385	;g
386	fld	dword [r1]
387	fsub	dword [r1+r3]		;g1 = gi[0] - gi[k1]
388	fld	dword [r1]
389	fadd	dword [r1+r3]		;g0 = gi[0] + gi[k1]
390
391	fld	dword [D_1_41421]
392	fmul	dword [r1+r2]		;g3 = D_1_41421 * gi[k3]
393	fld	dword [D_1_41421]
394	fmul	dword [r1+r3*2]		;g2 g3 g0 g1
395
396	fld	st3			;g1 g2 g3 g0 g1
397	fld	st3			;g0 g1 g2 g3 g0 g1
398	fxch	st5			;g1 g1 g2 g3 g0 g0
399	fadd	st0, st3		;g1+g3 g1 g2 g3 g0 g0
400	fxch	st4			;g0 g1 g2 g3 g1+g3 g0
401	fadd	st0, st2		;g0+g2 g1 g2 g3 g1+g3 g0
402	fxch	st3			;g3 g1 g2 g0+g2 g1+g3 g0
403	fsubp	st1, st0		;g1-g3 g2 g0+g2 g1+g3 g0
404	fxch	st1			;g2 g1-g3 g0+g2 g1+g3 g0
405	fsubp	st4, st0		;g1-g3 g0+g2 g1+g3 g0-g2
406	fxch	st2			;g1+g3 g0+g2 g1-g3 g0-g2
407
408	fstp	dword [r1+r3]		;gi[k1]
409	fstp	dword [r1]		;gi[0]
410	fstp	dword [r1+r2]		;gi[k3]
411	fstp	dword [r1+r3*2]		;gi[k2]
412
413	lea	r0, [r0+r3*4]		;fi += 4*k1 elements
414	lea	r1, [r1+r3*4]		;gi += 4*k1 elements
415	cmp	r0, r6			;NOTE(review): r6 is not set in this procedure
416	jb	.do2			;unsigned: loop while fi < r6
417
418
; Load the rotation step for this pass and reset the twiddle to (c1, s1) = (1, 0).
419	mov	r0, [sp(%$k)]
420	fld	dword [costab_fft +r0*4]
421	fld	dword [sintab_fft +r0*4]
422	fld	dword [D_1_0]
423	fld	dword [D_0_0]		;0.0 1.0 sin cos
424	fxch	st3			;cos 1.0 sin 0.0
425	fstp	dword [sp(%$t_c)]	;t_c = costab_fft[k]
426	fxch	st1			;sin 1.0 0.0
427	fstp	dword [sp(%$t_s)]	;t_s = sintab_fft[k]
428	fstp	dword [sp(%$c1)]	;c1 = 1.0
429	fstp	dword [sp(%$s1)]	;s1 = 0.0
430
431.for_init:
432	mov	r5, 4		;i = 1*fsize
433
; .for : one iteration per i; the body runs before the i < kx test at the bottom.
434.for:
435	fld	dword [sp(%$c1)]
436	fmul	dword [sp(%$t_c)]	;c1*t_c
437	fld	dword [sp(%$s1)]
438	fmul	dword [sp(%$t_s)]	;s1*t_s c1*t_c
439
440	fld	dword [sp(%$c1)]
441	fmul	dword [sp(%$t_s)]	;c1*t_s s1*t_s c1*t_c
442	fld	dword [sp(%$s1)]
443	fmul	dword [sp(%$t_c)]	;s1*t_c c1*t_s s1*t_s c1*t_c
444	fxch	st2			;s1*t_s c1*t_s s1*t_c c1*t_c
445	fsubp	st3, st0		;c1 = c1*t_c - s1*t_s (left in st2)
446	faddp	st1, st0		;s1 = s1*t_c + c1*t_s ; stack: s1 c1
447
448	fld	st1			;c1 s1 c1
449	fxch	st2			;c1 s1 c1 (swaps two equal values)
450	fmul	st0, st0		;c1c1 s1 c1
451	fld	st1			;s1 c1c1 s1 c1
452	fxch	st2			;s1 c1c1 s1 c1 (swaps two equal values)
453	fmul	st0, st0		;s1s1 c1c1 s1 c1
454
455	fxch	st3			;c1 c1c1 s1 s1s1
456	fst	dword [sp(%$c1)]	;c1
457	fxch	st2			;s1 c1c1 c1 s1s1
458	fst	dword [sp(%$s1)]	;s1 c1c1 c1 s1s1
459
460	fmulp	st2, st0		;c1c1 c1s1 s1s1
461	fsubrp	st2, st0		;c1s1 c2   (c2 = c1c1 - s1s1)
462	fadd	st0, st0		;s2 c2
463	fxch	st1			;c2 s2
464	fstp	dword [sp(%$c2)]
465	fstp	dword [sp(%$s2)]
466
467	mov	r0, [sp(%$fz)]
468	mov	r1, [sp(%$fz)]
469	add	r0, r5		;r0 = fi = fz + i
470	add	r1, r3
471	sub	r1, r5		;r1 = gi = fz + k1 - i
472
; .do3 : twiddled butterflies. The first two sections rotate (fi[k1], gi[k1])
; and (fi[k3], gi[k3]) by (c2, s2) and spill f0..f3 / g0..g3; the last two
; rotate by (c1, s1) and write the eight results back.
473.do3:
474	fld	dword [sp(%$s2)]
475	fmul	dword [r0+r3]
476	fld	dword [sp(%$c2)]
477	fmul	dword [r1+r3]
478
479	fld	dword [sp(%$c2)]
480	fmul	dword [r0+r3]
481	fld	dword [sp(%$s2)]
482	fmul	dword [r1+r3]
483	fxch	st2
484	fsubp	st3, st0		;b = s2*fi[k1] - c2*gi[k1]
485	faddp	st1, st0		;a = c2*fi[k1] + s2*gi[k1]  b
486
487	fld	dword [r1]
488	fsub	st0, st2		;g1 = gi[0] - b ; stack: g1 a b
489	fxch	st2
490	fadd	dword [r1]		;g0 = gi[0] + b ; stack: g0 a g1
491
492	fld	dword [r0]
493	fsub	st0, st2		;f1 = fi[0] - a ; stack: f1 g0 a g1
494	fxch	st2
495	fadd	dword [r0]		;f0 = fi[0] + a ; stack: f0 g0 f1 g1
496
497	fxch	st3			;g1 g0 f1 f0
498	fstp	dword [sp(%$g1)]
499	fstp	dword [sp(%$g0)]
500	fstp	dword [sp(%$f1)]
501	fstp	dword [sp(%$f0)]
502
503
504	fld	dword [sp(%$s2)]
505	fmul	dword [r0+r2]
506	fld	dword [sp(%$c2)]
507	fmul	dword [r1+r2]
508
509	fld	dword [sp(%$c2)]
510	fmul	dword [r0+r2]
511	fld	dword [sp(%$s2)]
512	fmul	dword [r1+r2]
513	fxch	st2
514	fsubp	st3, st0		;b = s2*fi[k3] - c2*gi[k3]
515	faddp	st1, st0		;a = c2*fi[k3] + s2*gi[k3]  b
516
517
518	fld	dword [r1+r3*2]
519	fsub	st0, st2		;g3 = gi[k2] - b ; stack: g3 a b
520	fxch	st2
521	fadd	dword [r1+r3*2]	;g2 = gi[k2] + b ; stack: g2 a g3
522
523	fld	dword [r0+r3*2]
524	fsub	st0, st2		;f3 = fi[k2] - a ; stack: f3 g2 a g3
525	fxch	st2
526	fadd	dword [r0+r3*2]	;f2 = fi[k2] + a ; stack: f2 g2 f3 g3
527
528	fxch	st3			;g3 g2 f3 f2
529	fstp	dword [sp(%$g3)]
530	fstp	dword [sp(%$g2)]
531	fstp	dword [sp(%$f3)]
532	fstp	dword [sp(%$f2)]
533
534
535	fld	dword [sp(%$s1)]
536	fmul	dword [sp(%$f2)]
537	fld	dword [sp(%$c1)]
538	fmul	dword [sp(%$g3)]
539
540	fld	dword [sp(%$c1)]
541	fmul	dword [sp(%$f2)]
542	fld	dword [sp(%$s1)]
543	fmul	dword [sp(%$g3)]
544	fxch	st2
545	fsubp	st3, st0		;b = s1*f2 - c1*g3
546	faddp	st1, st0		;a = c1*f2 + s1*g3  b
547
548	fld	dword [sp(%$g1)]
549	fsub	st0, st2		;gi[k3] = g1 - b ; stack: gi[k3] a b
550	fxch	st2
551	fadd	dword [sp(%$g1)]	;gi[k1] = g1 + b ; stack: gi[k1] a gi[k3]
552
553	fld	dword [sp(%$f0)]
554	fsub	st0, st2		;fi[k2] = f0 - a ; stack: fi[k2] gi[k1] a gi[k3]
555	fxch	st2
556	fadd	dword [sp(%$f0)]	;fi[0] = f0 + a ; stack: fi[0] gi[k1] fi[k2] gi[k3]
557
558	fxch	st3			;gi[k3] gi[k1] fi[k2] fi[0]
559	fstp	dword [r1+r2]
560	fstp	dword [r1+r3]
561	fstp	dword [r0+r3*2]
562	fstp	dword [r0]
563
564
565	fld	dword [sp(%$c1)]
566	fmul	dword [sp(%$g2)]
567	fld	dword [sp(%$s1)]
568	fmul	dword [sp(%$f3)]
569
570	fld	dword [sp(%$s1)]
571	fmul	dword [sp(%$g2)]
572	fld	dword [sp(%$c1)]
573	fmul	dword [sp(%$f3)]
574	fxch	st2
575	fsubp	st3, st0		;b = c1*g2 - s1*f3
576	faddp	st1, st0		;a = s1*g2 + c1*f3  b
577
578	fld	dword [sp(%$f1)]
579	fsub	st0, st2		;fi[k3] = f1 - b ; stack: fi[k3] a b
580	fxch	st2
581	fadd	dword [sp(%$f1)]	;fi[k1] = f1 + b ; stack: fi[k1] a fi[k3]
582
583	fld	dword [sp(%$g0)]
584	fsub	st0, st2		;gi[k2] = g0 - a ; stack: gi[k2] fi[k1] a fi[k3]
585	fxch	st2
586	fadd	dword [sp(%$g0)]	;gi[0] = g0 + a ; stack: gi[0] fi[k1] gi[k2] fi[k3]
587
588	fxch	st3			;fi[k3] fi[k1] gi[k2] gi[0]
589	fstp	dword [r0+r2]
590	fstp	dword [r0+r3]
591	fstp	dword [r1+r3*2]
592	fstp	dword [r1]
593
594
595	lea	r0, [r0+r3*4]		;fi += 4*k1 elements
596	lea	r1, [r1+r3*4]		;gi += 4*k1 elements
597	cmp	r0, r6			;NOTE(review): r6 is not set in this procedure
598	jb near	.do3			;unsigned: loop while fi < r6
599
600	add	r5, 4			;i++
601	cmp	r5, r4
602	jb near	.for			;loop while i < kx
603
604	cmp	r3, [sp(%$n)]		;finished once k1*fsize >= n (unsigned)
605	jae	.exit
606
607	add	dword [sp(%$k)], 2	;k  += 2;
608	lea	r3, [r3*4]		;k1 *= 4
609	lea	r2, [r2*4]		;k3 *= 4
610	lea	r4, [r4*4]		;kx *= 4
611	mov	r0, [sp(%$fz)]	;fi
612	lea	r1, [r0+r4]		;gi = fi + kx
613	jmp	.do
614
615.exit:
616	popd	ebp, ebx, esi, edi
617endproc
618
619	end
620