1 /*
2     Copyright (C) 2017 Fredrik Johansson
3 
4     This file is part of Arb.
5 
6     Arb is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
10 */
11 
12 #include "arb.h"
13 
14 slong _arb_compute_bs_exponents(slong * tab, slong n);
15 slong _arb_get_exp_pos(const slong * tab, slong step);
16 
17 static void
bsplit(fmpz_t T,fmpz_t Q,flint_bitcnt_t * Qexp,const slong * xexp,const fmpz * xpow,flint_bitcnt_t r,slong a,slong b)18 bsplit(fmpz_t T, fmpz_t Q, flint_bitcnt_t * Qexp,
19     const slong * xexp,
20     const fmpz * xpow, flint_bitcnt_t r, slong a, slong b)
21 {
22     int cc;
23 
24     if (b - a == 1)
25     {
26         count_trailing_zeros(cc, (2 * a + 2));
27         fmpz_neg_ui(Q, (2 * a + 2) >> cc);
28         fmpz_mul_ui(Q, Q, 2 * a + 3);
29         *Qexp = 2 * r + cc;
30 
31         fmpz_set(T, xpow);
32     }
33     else if (b - a == 2)
34     {
35         fmpz_mul2_uiui(T, xpow, (2 * a + 4), (2 * a + 5));
36         fmpz_mul_2exp(T, T, 2 * r);
37         fmpz_neg(T, T);
38         fmpz_add(T, T, xpow + 1);
39 
40         count_trailing_zeros(cc, (2 * a + 4));
41         fmpz_neg_ui(Q, (2 * a + 4) >> cc);
42         fmpz_mul_ui(Q, Q, 2 * a + 5);
43         *Qexp = 2 * r + cc;
44 
45         count_trailing_zeros(cc, (2 * a + 2));
46         fmpz_mul2_uiui(Q, Q, (2 * a + 2) >> cc, (2 * a + 3));
47         fmpz_neg(Q, Q);
48         *Qexp += 2 * r + cc;
49     }
50     else
51     {
52         slong step, m, i;
53         flint_bitcnt_t Q2exp[1];
54         fmpz_t Q2, T2;
55 
56         step = (b - a) / 2;
57         m = a + step;
58 
59         fmpz_init(Q2);
60         fmpz_init(T2);
61 
62         bsplit(T,  Q,  Qexp,  xexp, xpow, r, a, m);
63         bsplit(T2, Q2, Q2exp, xexp, xpow, r, m, b);
64 
65         fmpz_mul(T, T, Q2);
66         fmpz_mul_2exp(T, T, *Q2exp);
67 
68         /* find x^step in table */
69         i = _arb_get_exp_pos(xexp, step);
70         fmpz_addmul(T, xpow + i, T2);
71         fmpz_clear(T2);
72 
73         fmpz_mul(Q, Q, Q2);
74         *Qexp = *Qexp + *Q2exp;
75         fmpz_clear(Q2);
76     }
77 }
78 
79 /* todo: also allow computing cos, using the same table... */
80 void
_arb_sin_sum_bs_powtab(fmpz_t T,fmpz_t Q,flint_bitcnt_t * Qexp,const fmpz_t x,flint_bitcnt_t r,slong N)81 _arb_sin_sum_bs_powtab(fmpz_t T, fmpz_t Q, flint_bitcnt_t * Qexp,
82     const fmpz_t x, flint_bitcnt_t r, slong N)
83 {
84     slong * xexp;
85     slong length, i;
86     fmpz * xpow;
87 
88     /* compute the powers of x^2 that will appear (at least x^2) */
89     xexp = flint_calloc(2 * FLINT_BITS, sizeof(slong));
90     length = _arb_compute_bs_exponents(xexp, N);
91 
92     xpow = _fmpz_vec_init(length);
93     fmpz_mul(xpow, x, x);
94 
95     /* build x^i table */
96     for (i = 1; i < length; i++)
97     {
98         if (xexp[i] == 2 * xexp[i-1])
99         {
100             fmpz_mul(xpow + i, xpow + i - 1, xpow + i - 1);
101         }
102         else if (xexp[i] == 2 * xexp[i-2]) /* prefer squaring if possible */
103         {
104             fmpz_mul(xpow + i, xpow + i - 2, xpow + i - 2);
105         }
106         else if (xexp[i] == 2 * xexp[i-1] + 1)
107         {
108             fmpz_mul(xpow + i, xpow + i - 1, xpow + i - 1);
109             fmpz_mul(xpow + i, xpow + i, xpow);
110         }
111         else if (xexp[i] == 2 * xexp[i-2] + 1)
112         {
113             fmpz_mul(xpow + i, xpow + i - 2, xpow + i - 2);
114             fmpz_mul(xpow + i, xpow + i, xpow);
115         }
116         else
117         {
118             flint_printf("power table has the wrong structure!\n");
119             flint_abort();
120         }
121     }
122 
123     bsplit(T, Q, Qexp, xexp, xpow, r, 0, N);
124     _fmpz_vec_clear(xpow, length);
125     flint_free(xexp);
126 }
127 
128 /*
129 Determine N such that the error is bounded by 2^-prec when summing the
130 Taylor series of sin(x) up to term x^(2N+1) inclusive. We choose an N with
131 many trailing zeros to improve efficiency of the binary splitting.
132 */
133 static slong
bs_num_terms(slong mag,slong prec)134 bs_num_terms(slong mag, slong prec)
135 {
136     slong N;
137 
138     N = _arb_exp_taylor_bound(mag, prec);
139     N = N / 2 - 1;
140     N = FLINT_MAX(N, 1);
141 
142     if (N > 10000)
143         while (N % 128 != 0)
144             N++;
145 
146     if (N > 1000)
147         while (N % 16 != 0)
148             N++;
149 
150     if (N > 100)
151         while (N % 2 != 0)
152             N++;
153 
154     return N;
155 }
156 
157 void
arb_sin_cos_fmpz_div_2exp_bsplit(arb_t wsin,arb_t wcos,const fmpz_t x,flint_bitcnt_t r,slong prec)158 arb_sin_cos_fmpz_div_2exp_bsplit(arb_t wsin, arb_t wcos, const fmpz_t x, flint_bitcnt_t r, slong prec)
159 {
160     fmpz_t T, Q;
161     slong N, xmag;
162     flint_bitcnt_t Qexp[1];
163 
164     /* slightly reduce memory usage at very high precision */
165     arb_zero(wsin);
166     arb_zero(wcos);
167 
168     fmpz_init(T);
169     fmpz_init(Q);
170 
171     if (r > prec)
172         flint_abort();
173 
174     /* Binary splitting (+1 fixed-point ulp truncation error). */
175     xmag = fmpz_bits(x) - r;
176     N = bs_num_terms(xmag, prec);
177    _arb_sin_sum_bs_powtab(T, Q, Qexp, x, r, N);
178 
179     /* we still need to multiply and add x/2^r to get sine */
180     fmpz_mul(T, T, x);
181     Qexp[0] += r;
182 
183     /* T = T / Q  (+1 fixed-point ulp error). */
184     if (Qexp[0] >= prec)
185     {
186         fmpz_tdiv_q_2exp(T, T, Qexp[0] - prec);
187         fmpz_tdiv_q(T, T, Q);
188     }
189     else
190     {
191         fmpz_mul_2exp(T, T, prec - Qexp[0]);
192         fmpz_tdiv_q(T, T, Q);
193     }
194 
195     fmpz_mul_2exp(Q, x, prec - r);
196     fmpz_add(T, T, Q);
197 
198     /* T = sin(u) with at most 2 fixed-point ulp error. */
199     arf_set_fmpz(arb_midref(wsin), T);
200     arf_mul_2exp_si(arb_midref(wsin), arb_midref(wsin), -prec);
201     mag_set_ui_2exp_si(arb_radref(wsin), 2, -prec);
202 
203     /* compute cos from sin */
204     arb_mul(wcos, wsin, wsin, prec);
205     arb_sub_ui(wcos, wcos, 1, prec);
206     arb_neg(wcos, wcos);
207     arb_sqrt(wcos, wcos, prec);
208 
209     fmpz_clear(T);
210     fmpz_clear(Q);
211 }
212 
213 void
arb_sin_cos_arf_bb(arb_t zsin,arb_t zcos,const arf_t x,slong prec)214 arb_sin_cos_arf_bb(arb_t zsin, arb_t zcos, const arf_t x, slong prec)
215 {
216     slong k, iter, bits, r, xmag, q, wp;
217     slong argred_bits, start_bits;
218     int inexact, negative;
219     fmpz_t t, u;
220     arb_t wcos, wsin, tmp1;
221 
222     if (zsin == NULL)
223     {
224         arb_init(tmp1);
225         arb_sin_cos_arf_bb(tmp1, zcos, x, prec);
226         arb_clear(tmp1);
227         return;
228     }
229 
230     if (zcos == NULL)
231     {
232         arb_init(tmp1);
233         arb_sin_cos_arf_bb(zsin, tmp1, x, prec);
234         arb_clear(tmp1);
235         return;
236     }
237 
238     if (arf_is_zero(x))
239     {
240         arb_zero(zsin);
241         arb_one(zcos);
242         return;
243     }
244 
245     xmag = arf_abs_bound_lt_2exp_si(x);
246     negative = arf_sgn(x) < 0;
247 
248     /* We assume that this function only gets called with something
249        reasonable as input (huge/tiny input will be handled by
250        the main sin/cos wrapper). */
251     if (arf_is_special(x) || arf_cmpabs_d(x, 3.15) > 0 || xmag < -2 * prec - 100)
252     {
253         flint_printf("arb_sin_cos_arf_bb: unexpectedly large/small input\n");
254         flint_abort();
255     }
256 
257     argred_bits = 24;
258     start_bits = argred_bits * 3;
259 
260     q = FLINT_MAX(0, xmag + argred_bits);
261     if (q <= 2)
262         q = 0;
263 
264     wp = prec + 10 + 2 * (q - xmag) + 2 * FLINT_BIT_COUNT(prec);
265 
266     fmpz_init(t);
267     fmpz_init(u);
268     arb_init(wcos);
269     arb_init(wsin);
270     arb_init(tmp1);
271 
272     /* Convert x/2^q to a fixed-point number. */
273     inexact = arf_get_fmpz_fixed_si(t, x, -wp + q);
274     fmpz_abs(t, t);
275 
276     /* Aliasing of z and x is safe now that only use t. */
277     /* Start with z = 1. */
278     arb_one(zcos);
279     arb_zero(zsin);
280 
281     /* Bit-burst loop. */
282     for (iter = 0, bits = start_bits; !fmpz_is_zero(t); iter++, bits *= 3)
283     {
284         /* Extract bits. */
285         r = FLINT_MIN(bits, wp);
286         fmpz_tdiv_q_2exp(u, t, wp - r);
287 
288         arb_sin_cos_fmpz_div_2exp_bsplit(wsin, wcos, u, r, wp);
289 
290         /* Remove used bits. */
291         fmpz_mul_2exp(u, u, wp - r);
292         fmpz_sub(t, t, u);
293 
294         /* zsin, zcos = zsin wcos + zcos wsin, zcos wcos - zsin wsin */
295         /* using karatsuba */
296         arb_add(tmp1, zsin, zcos, wp);
297         arb_mul(zcos, zcos, wcos, wp);
298         arb_add(wcos, wcos, wsin, wp);
299         arb_mul(wsin, wsin, zsin, wp);
300         arb_mul(tmp1, tmp1, wcos, wp);
301         arb_sub(zsin, tmp1, wsin, wp);
302         arb_sub(zsin, zsin, zcos, wp);
303         arb_sub(zcos, zcos, wsin, wp);
304         arb_zero(tmp1);  /* slightly reduce memory usage */
305     }
306 
307     /* Initial fixed-point truncation error. */
308     if (inexact)
309     {
310         arb_add_error_2exp_si(zcos, -wp);
311         arb_add_error_2exp_si(zsin, -wp);
312     }
313 
314     if (q != 0)
315     {
316         /* cos(x) = 2 cos(x/2)^2 - 1 */
317         for (k = 0; k < q; k++)
318         {
319             arb_mul(zcos, zcos, zcos, wp);
320             arb_mul_2exp_si(zcos, zcos, 1);
321             arb_sub_ui(zcos, zcos, 1, wp);
322         }
323 
324         arb_mul(tmp1, zcos, zcos, wp);
325         arb_sub_ui(tmp1, tmp1, 1, wp);
326         arb_neg(tmp1, tmp1);
327         arb_sqrt(zsin, tmp1, wp);
328     }
329 
330     if (negative)
331         arb_neg(zsin, zsin);
332 
333     arb_set_round(zsin, zsin, prec);
334     arb_set_round(zcos, zcos, prec);
335 
336     fmpz_clear(t);
337     fmpz_clear(u);
338     arb_clear(wcos);
339     arb_clear(wsin);
340     arb_clear(tmp1);
341 }
342 
343