1 #include "config.h"
2 #include "libbench2/bench.h"
3 #include <math.h>
4
5 #define DG unsigned short
6 #define ACC unsigned long
7 #define REAL bench_real
8 #define BITS_IN_REAL 53 /* mantissa */
9
10 #define SHFT 16
11 #define RADIX 65536L
12 #define IRADIX (1.0 / RADIX)
13 #define LO(x) ((x) & (RADIX - 1))
14 #define HI(x) ((x) >> SHFT)
15 #define HI_SIGNED(x) \
16 ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1))
17 #define ZEROEXP (-32768)
18
19 #define LEN 10
20
21 typedef struct {
22 short sign;
23 short expt;
24 DG d[LEN];
25 } N[1];
26
27 #define EXA a->expt
28 #define EXB b->expt
29 #define EXC c->expt
30
31 #define AD a->d
32 #define BD b->d
33
34 #define SGNA a->sign
35 #define SGNB b->sign
36
37 static const N zero = {{ 1, ZEROEXP, {0} }};
38
cpy(const N a,N b)39 static void cpy(const N a, N b)
40 {
41 *b = *a;
42 }
43
fromreal(REAL x,N a)44 static void fromreal(REAL x, N a)
45 {
46 int i, e;
47
48 cpy(zero, a);
49 if (x == 0.0) return;
50
51 if (x >= 0) { SGNA = 1; }
52 else { SGNA = -1; x = -x; }
53
54 e = 0;
55 while (x >= 1.0) { x *= IRADIX; ++e; }
56 while (x < IRADIX) { x *= RADIX; --e; }
57 EXA = e;
58
59 for (i = LEN - 1; i >= 0 && x != 0.0; --i) {
60 REAL y;
61
62 x *= RADIX;
63 y = (REAL) ((int) x);
64 AD[i] = (DG)y;
65 x -= y;
66 }
67 }
68
fromshort(int x,N a)69 static void fromshort(int x, N a)
70 {
71 cpy(zero, a);
72
73 if (x < 0) { x = -x; SGNA = -1; }
74 else { SGNA = 1; }
75 EXA = 1;
76 AD[LEN - 1] = x;
77 }
78
pack(DG * d,int e,int s,int l,N a)79 static void pack(DG *d, int e, int s, int l, N a)
80 {
81 int i, j;
82
83 for (i = l - 1; i >= 0; --i, --e)
84 if (d[i] != 0)
85 break;
86
87 if (i < 0) {
88 /* number is zero */
89 cpy(zero, a);
90 } else {
91 EXA = e;
92 SGNA = s;
93
94 if (i >= LEN - 1) {
95 for (j = LEN - 1; j >= 0; --i, --j)
96 AD[j] = d[i];
97 } else {
98 for (j = LEN - 1; i >= 0; --i, --j)
99 AD[j] = d[i];
100 for ( ; j >= 0; --j)
101 AD[j] = 0;
102 }
103 }
104 }
105
106
107 /* compare absolute values */
abscmp(const N a,const N b)108 static int abscmp(const N a, const N b)
109 {
110 int i;
111 if (EXA > EXB) return 1;
112 if (EXA < EXB) return -1;
113 for (i = LEN - 1; i >= 0; --i) {
114 if (AD[i] > BD[i])
115 return 1;
116 if (AD[i] < BD[i])
117 return -1;
118 }
119 return 0;
120 }
121
eq(const N a,const N b)122 static int eq(const N a, const N b)
123 {
124 return (SGNA == SGNB) && (abscmp(a, b) == 0);
125 }
126
127 /* add magnitudes, for |a| >= |b| */
addmag0(int s,const N a,const N b,N c)128 static void addmag0(int s, const N a, const N b, N c)
129 {
130 int ia, ib;
131 ACC r = 0;
132 DG d[LEN + 1];
133
134 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
135 r += (ACC)AD[ia] + (ACC)BD[ib];
136 d[ia] = LO(r);
137 r = HI(r);
138 }
139 for (; ia < LEN; ++ia) {
140 r += (ACC)AD[ia];
141 d[ia] = LO(r);
142 r = HI(r);
143 }
144 d[ia] = LO(r);
145 pack(d, EXA + 1, s * SGNA, LEN + 1, c);
146 }
147
addmag(int s,const N a,const N b,N c)148 static void addmag(int s, const N a, const N b, N c)
149 {
150 if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c);
151 }
152
153 /* subtract magnitudes, for |a| >= |b| */
submag0(int s,const N a,const N b,N c)154 static void submag0(int s, const N a, const N b, N c)
155 {
156 int ia, ib;
157 ACC r = 0;
158 DG d[LEN];
159
160 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
161 r += (ACC)AD[ia] - (ACC)BD[ib];
162 d[ia] = LO(r);
163 r = HI_SIGNED(r);
164 }
165 for (; ia < LEN; ++ia) {
166 r += (ACC)AD[ia];
167 d[ia] = LO(r);
168 r = HI_SIGNED(r);
169 }
170
171 pack(d, EXA, s * SGNA, LEN, c);
172 }
173
submag(int s,const N a,const N b,N c)174 static void submag(int s, const N a, const N b, N c)
175 {
176 if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c);
177 }
178
179 /* c = a + b */
add(const N a,const N b,N c)180 static void add(const N a, const N b, N c)
181 {
182 if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c);
183 }
184
sub(const N a,const N b,N c)185 static void sub(const N a, const N b, N c)
186 {
187 if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c);
188 }
189
mul(const N a,const N b,N c)190 static void mul(const N a, const N b, N c)
191 {
192 DG d[2 * LEN];
193 int i, j, k;
194 ACC r;
195
196 for (i = 0; i < LEN; ++i)
197 d[2 * i] = d[2 * i + 1] = 0;
198
199 for (i = 0; i < LEN; ++i) {
200 ACC ai = AD[i];
201 if (ai) {
202 r = 0;
203 for (j = 0, k = i; j < LEN; ++j, ++k) {
204 r += ai * (ACC)BD[j] + (ACC)d[k];
205 d[k] = LO(r);
206 r = HI(r);
207 }
208 d[k] = LO(r);
209 }
210 }
211
212 pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c);
213 }
214
toreal(const N a)215 static REAL toreal(const N a)
216 {
217 REAL h, l, f;
218 int i, bits;
219 ACC r;
220 DG sticky;
221
222 if (EXA != ZEROEXP) {
223 f = IRADIX;
224 i = LEN;
225
226 bits = 0;
227 h = (r = AD[--i]) * f; f *= IRADIX;
228 for (bits = 0; r > 0; ++bits)
229 r >>= 1;
230
231 /* first digit */
232 while (bits + SHFT <= BITS_IN_REAL) {
233 h += AD[--i] * f; f *= IRADIX; bits += SHFT;
234 }
235
236 /* guard digit (leave one bit for sticky bit, hence `<' instead
237 of `<=') */
238 bits = 0; l = 0.0;
239 while (bits + SHFT < BITS_IN_REAL) {
240 l += AD[--i] * f; f *= IRADIX; bits += SHFT;
241 }
242
243 /* sticky bit */
244 sticky = 0;
245 while (i > 0)
246 sticky |= AD[--i];
247
248 if (sticky)
249 l += (RADIX / 2) * f;
250
251 h += l;
252
253 for (i = 0; i < EXA; ++i) h *= (REAL)RADIX;
254 for (i = 0; i > EXA; --i) h *= IRADIX;
255 if (SGNA == -1) h = -h;
256 return h;
257 } else {
258 return 0.0;
259 }
260 }
261
neg(N a)262 static void neg(N a)
263 {
264 SGNA = -SGNA;
265 }
266
inv(const N a,N x)267 static void inv(const N a, N x)
268 {
269 N w, z, one, two;
270
271 fromreal(1.0 / toreal(a), x); /* initial guess */
272 fromshort(1, one);
273 fromshort(2, two);
274
275 for (;;) {
276 /* Newton */
277 mul(a, x, w);
278 sub(two, w, z);
279 if (eq(one, z)) break;
280 mul(x, z, x);
281 }
282 }
283
284
285 /* 2 pi */
286 static const N n2pi = {{
287 1, 1,
288 {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6}
289 }};
290
291 /* 1 / 31! */
292 static const N i31fac = {{
293 1, -7,
294 {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382}
295 }};
296
297
298 /* 1 / 32! */
299 static const N i32fac = {{
300 1, -7,
301 {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293}
302 }};
303
msin(const N a,N b)304 static void msin(const N a, N b)
305 {
306 N a2, g, k;
307 int i;
308
309 cpy(i31fac, g);
310 cpy(g, b);
311 mul(a, a, a2);
312
313 /* Taylor */
314 for (i = 31; i > 1; i -= 2) {
315 fromshort(i * (i - 1), k);
316 mul(k, g, g);
317 mul(a2, b, k);
318 sub(g, k, b);
319 }
320 mul(a, b, b);
321 }
322
mcos(const N a,N b)323 static void mcos(const N a, N b)
324 {
325 N a2, g, k;
326 int i;
327
328 cpy(i32fac, g);
329 cpy(g, b);
330 mul(a, a, a2);
331
332 /* Taylor */
333 for (i = 32; i > 0; i -= 2) {
334 fromshort(i * (i - 1), k);
335 mul(k, g, g);
336 mul(a2, b, k);
337 sub(g, k, b);
338 }
339 }
340
by2pi(REAL m,REAL n,N a)341 static void by2pi(REAL m, REAL n, N a)
342 {
343 N b;
344
345 fromreal(n, b);
346 inv(b, a);
347 fromreal(m, b);
348 mul(a, b, a);
349 mul(n2pi, a, a);
350 }
351
352 static void sin2pi(REAL m, REAL n, N a);
cos2pi(REAL m,REAL n,N a)353 static void cos2pi(REAL m, REAL n, N a)
354 {
355 N b;
356 if (m < 0) cos2pi(-m, n, a);
357 else if (m > n * 0.5) cos2pi(n - m, n, a);
358 else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);}
359 else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a);
360 else { by2pi(m, n, b); mcos(b, a); }
361 }
362
sin2pi(REAL m,REAL n,N a)363 static void sin2pi(REAL m, REAL n, N a)
364 {
365 N b;
366 if (m < 0) {sin2pi(-m, n, a); neg(a);}
367 else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);}
368 else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);}
369 else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);}
370 else {by2pi(m, n, b); msin(b, a);}
371 }
372
373 /*----------------------------------------------------------------------*/
374 /* FFT stuff */
375
376 /* (r0 + i i0)(r1 + i i1) */
cmul(N r0,N i0,N r1,N i1,N r2,N i2)377 static void cmul(N r0, N i0, N r1, N i1, N r2, N i2)
378 {
379 N s, t, q;
380 mul(r0, r1, s);
381 mul(i0, i1, t);
382 sub(s, t, q);
383 mul(r0, i1, s);
384 mul(i0, r1, t);
385 add(s, t, i2);
386 cpy(q, r2);
387 }
388
389 /* (r0 - i i0)(r1 + i i1) */
cmulj(N r0,N i0,N r1,N i1,N r2,N i2)390 static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2)
391 {
392 N s, t, q;
393 mul(r0, r1, s);
394 mul(i0, i1, t);
395 add(s, t, q);
396 mul(r0, i1, s);
397 mul(i0, r1, t);
398 sub(s, t, i2);
399 cpy(q, r2);
400 }
401
mcexp(int m,int n,N r,N i)402 static void mcexp(int m, int n, N r, N i)
403 {
404 static int cached_n = -1;
405 static N w[64][2];
406 int k, j;
407 if (n != cached_n) {
408 for (j = 1, k = 0; j < n; j += j, ++k) {
409 cos2pi(j, n, w[k][0]);
410 sin2pi(j, n, w[k][1]);
411 }
412 cached_n = n;
413 }
414
415 fromshort(1, r);
416 fromshort(0, i);
417 if (m > 0) {
418 for (k = 0; m; ++k, m >>= 1)
419 if (m & 1)
420 cmul(w[k][0], w[k][1], r, i, r, i);
421 } else {
422 m = -m;
423 for (k = 0; m; ++k, m >>= 1)
424 if (m & 1)
425 cmulj(w[k][0], w[k][1], r, i, r, i);
426 }
427 }
428
bitrev(int n,N * a)429 static void bitrev(int n, N *a)
430 {
431 int i, j, m;
432 for (i = j = 0; i < n - 1; ++i) {
433 if (i < j) {
434 N t;
435 cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]);
436 cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]);
437 }
438
439 /* bit reversed counter */
440 m = n; do { m >>= 1; j ^= m; } while (!(j & m));
441 }
442 }
443
fft0(int n,N * a,int sign)444 static void fft0(int n, N *a, int sign)
445 {
446 int i, j, k;
447
448 bitrev(n, a);
449 for (i = 1; i < n; i = 2 * i) {
450 for (j = 0; j < i; ++j) {
451 N wr, wi;
452 mcexp(sign * (int)j, 2 * i, wr, wi);
453 for (k = j; k < n; k += 2 * i) {
454 N *a0 = a + 2 * k;
455 N *a1 = a0 + 2 * i;
456 N r0, i0, r1, i1, t0, t1, xr, xi;
457 cpy(a0[0], r0); cpy(a0[1], i0);
458 cpy(a1[0], r1); cpy(a1[1], i1);
459 mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr);
460 mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi);
461 add(r0, xr, a0[0]); add(i0, xi, a0[1]);
462 sub(r0, xr, a1[0]); sub(i0, xi, a1[1]);
463 }
464 }
465 }
466 }
467
468 /* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */
bluestein_sequence(int n,N * a)469 static void bluestein_sequence(int n, N *a)
470 {
471 int k, ksq, n2 = 2 * n;
472
473 ksq = 1; /* (-1)^2 */
474 for (k = 0; k < n; ++k) {
475 /* careful with overflow */
476 ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2;
477 mcexp(ksq, n2, a[2*k], a[2*k+1]);
478 }
479 }
480
pow2_atleast(int x)481 static int pow2_atleast(int x)
482 {
483 int h;
484 for (h = 1; h < x; h = 2 * h)
485 ;
486 return h;
487 }
488
489 static N *cached_bluestein_w = 0;
490 static N *cached_bluestein_y = 0;
491 static int cached_bluestein_n = -1;
492
bluestein(int n,N * a)493 static void bluestein(int n, N *a)
494 {
495 int nb = pow2_atleast(2 * n);
496 N *b = (N *)bench_malloc(2 * nb * sizeof(N));
497 N *w = cached_bluestein_w;
498 N *y = cached_bluestein_y;
499 N nbinv;
500 int i;
501
502 fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */
503
504 if (cached_bluestein_n != n) {
505 if (w) bench_free(w);
506 if (y) bench_free(y);
507 w = (N *)bench_malloc(2 * n * sizeof(N));
508 y = (N *)bench_malloc(2 * nb * sizeof(N));
509 cached_bluestein_n = n;
510 cached_bluestein_w = w;
511 cached_bluestein_y = y;
512
513 bluestein_sequence(n, w);
514 for (i = 0; i < 2*nb; ++i) cpy(zero, y[i]);
515
516 for (i = 0; i < n; ++i) {
517 cpy(w[2*i], y[2*i]);
518 cpy(w[2*i+1], y[2*i+1]);
519 }
520 for (i = 1; i < n; ++i) {
521 cpy(w[2*i], y[2*(nb-i)]);
522 cpy(w[2*i+1], y[2*(nb-i)+1]);
523 }
524
525 fft0(nb, y, -1);
526 }
527
528 for (i = 0; i < 2*nb; ++i) cpy(zero, b[i]);
529
530 for (i = 0; i < n; ++i)
531 cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]);
532
533 /* scaled convolution b * y */
534 fft0(nb, b, -1);
535
536 for (i = 0; i < nb; ++i)
537 cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]);
538 fft0(nb, b, 1);
539
540 for (i = 0; i < n; ++i) {
541 cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]);
542 mul(nbinv, a[2*i], a[2*i]);
543 mul(nbinv, a[2*i+1], a[2*i+1]);
544 }
545
546 bench_free(b);
547 }
548
swapri(int n,N * a)549 static void swapri(int n, N *a)
550 {
551 int i;
552 for (i = 0; i < n; ++i) {
553 N t;
554 cpy(a[2 * i], t);
555 cpy(a[2 * i + 1], a[2 * i]);
556 cpy(t, a[2 * i + 1]);
557 }
558 }
559
fft1(int n,N * a,int sign)560 static void fft1(int n, N *a, int sign)
561 {
562 if (power_of_two(n)) {
563 fft0(n, a, sign);
564 } else {
565 if (sign == 1) swapri(n, a);
566 bluestein(n, a);
567 if (sign == 1) swapri(n, a);
568 }
569 }
570
fromrealv(int n,bench_complex * a,N * b)571 static void fromrealv(int n, bench_complex *a, N *b)
572 {
573 int i;
574
575 for (i = 0; i < n; ++i) {
576 fromreal(c_re(a[i]), b[2 * i]);
577 fromreal(c_im(a[i]), b[2 * i + 1]);
578 }
579 }
580
compare(int n,N * a,N * b,double * err)581 static void compare(int n, N *a, N *b, double *err)
582 {
583 int i;
584 double e1, e2, einf;
585 double n1, n2, ninf;
586
587 e1 = e2 = einf = 0.0;
588 n1 = n2 = ninf = 0.0;
589
590 # define DO(x1, x2, xinf, var) { \
591 double d = var; \
592 if (d < 0) d = -d; \
593 x1 += d; x2 += d * d; if (d > xinf) xinf = d; \
594 }
595
596 for (i = 0; i < 2 * n; ++i) {
597 N dd;
598 sub(a[i], b[i], dd);
599 DO(n1, n2, ninf, toreal(a[i]));
600 DO(e1, e2, einf, toreal(dd));
601 }
602
603 # undef DO
604 err[0] = e1 / n1;
605 err[1] = sqrt(e2 / n2);
606 err[2] = einf / ninf;
607 }
608
fftaccuracy(int n,bench_complex * a,bench_complex * ffta,int sign,double err[6])609 void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
610 int sign, double err[6])
611 {
612 N *b = (N *)bench_malloc(2 * n * sizeof(N));
613 N *fftb = (N *)bench_malloc(2 * n * sizeof(N));
614 N mn, ninv;
615 int i;
616
617 fromreal(n, mn); inv(mn, ninv);
618
619 /* forward error */
620 fromrealv(n, a, b); fromrealv(n, ffta, fftb);
621 fft1(n, b, sign);
622 compare(n, b, fftb, err);
623
624 /* backward error */
625 fromrealv(n, a, b); fromrealv(n, ffta, fftb);
626 for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]);
627 fft1(n, fftb, -sign);
628 compare(n, b, fftb, err + 3);
629
630 bench_free(fftb);
631 bench_free(b);
632 }
633
fftaccuracy_done(void)634 void fftaccuracy_done(void)
635 {
636 if (cached_bluestein_w) bench_free(cached_bluestein_w);
637 if (cached_bluestein_y) bench_free(cached_bluestein_y);
638 cached_bluestein_w = 0;
639 cached_bluestein_y = 0;
640 cached_bluestein_n = -1;
641 }
642