1 // arith07.cpp Copyright (C) 1990-2021 Codemist
2
3 //
4 // Arithmetic functions. negation plus a load of Common Lisp things
5 // for support of complex numbers.
6 //
7 //
8
9 /**************************************************************************
10 * Copyright (C) 2021, Codemist. A C Norman *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are *
14 * met: *
15 * *
16 * * Redistributions of source code must retain the relevant *
17 * copyright notice, this list of conditions and the following *
18 * disclaimer. *
19 * * Redistributions in binary form must reproduce the above *
20 * copyright notice, this list of conditions and the following *
21 * disclaimer in the documentation and/or other materials provided *
22 * with the distribution. *
23 * *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS *
27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE *
28 * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, *
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, *
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
31 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
32 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR *
33 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF *
34 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH *
35 * DAMAGE. *
36 *************************************************************************/
37
38
39 // $Id: arith07.cpp 5604 2021-01-20 22:22:10Z arthurcnorman $
40
41 #include "headers.h"
42
43
copyb(LispObject a)44 LispObject copyb(LispObject a)
45 //
46 // copy a bignum.
47 //
48 { LispObject b;
49 size_t len = bignum_length(a), i;
50 Save save(a);
51 b = get_basic_vector(TAG_NUMBERS, TYPE_BIGNUM, len);
52 save.restore(a);
53 len = (len-CELL)/4;
54 for (i=0; i<len; i++)
55 bignum_digits(b)[i] = vbignum_digits(a)[i];
56 return b;
57 }
58
LispObject negateb(LispObject a)
//
// Negate a bignum.  Note that negating the 1-word bignum
// value of 0x08000000 will produce a fixnum as a result (for 32 bits),
// which might confuse the caller... in a similar way negating
// the value -0x40000000 will need to promote from a one-word
// bignum to a 2-word bignum. How messy just for negation!
// And on 64 bit systems the same effect applies but with larger values!
// In an analogous manner negating the positive number of the style
// 0x40000000 can lead to a negative result that uses one less digit.
// Well on a 64-bit machine it is a 2-word bignum that can end up
// negated to get a fixnum result.
//
{ LispObject b;
    size_t len = bignum_length(a), i;
    int32_t carry;
// There are two messy special cases here. The first is that there is a
// positive value (2^27 or 2^59) which has to be represented as a bignum,
// but when you negate it you get a fixnum.
// Then there will be negative values (the smallest being -2^31 or -2^62)
// that fit in a certain number of words of bignum, but their absolute
// value needs one more word...
// Note that on a 64-bit machine there ought never to be any one-word
// bignums because all the values representable with just one 31-bit digit
// can be handled as fixnums instead.
    if (SIXTY_FOUR_BIT &&
        len == CELL+8)   // two-word bignum - do specially
    {   if (bignum_digits(a)[0] == 0 &&
            bignum_digits(a)[1] == (int32_t)0x10000000)
            return MOST_NEGATIVE_FIXNUM;
        else if (bignum_digits(a)[0] == 0 &&
                 (int32_t)bignum_digits(a)[1] == -(int32_t)(1<<30))
            return make_three_word_bignum(0, 1<<30, 0);
        uint32_t d0 = bignum_digits(a)[0];
        int32_t d1 = (int32_t)~bignum_digits(a)[1];
// NOTE(review): when d0 == 0 control falls through to the general path
// below (which handles that case correctly), so this increment of d1 is
// a dead store - confirm before tidying it away.
        if (d0 == 0) d1++;
        else return make_two_word_bignum(d1, (-d0) & 0x7fffffff);
    }
    if (!SIXTY_FOUR_BIT &&
        len == CELL+4)   // one-word bignum - do specially
    {   int32_t d0 = -(int32_t)bignum_digits(a)[0];
        if (d0 == MOST_NEGATIVE_FIXVAL) return MOST_NEGATIVE_FIXNUM;
        else if (d0 == 0x40000000) return make_two_word_bignum(0, d0);
        else return make_one_word_bignum(d0);
    }
// General case: allocate a result bignum of the same length, protecting
// a across the (possibly GC-triggering) allocation.
    {   Save save(a);
        b = get_basic_vector(TAG_NUMBERS, TYPE_BIGNUM, len);
        errexit();
        save.restore(a);
    }
    len = (len-CELL-4)/4;   // number of low (unsigned, 31-bit) digits
    carry = -1;
    for (i=0; i<len; i++)
    {
// The next couple of lines really caught me out wrt compiler optimisation
// before I put in all the casts. I used to have what was in effect
//    carry = (signed_x ^ 0x7fffffff) + (int32_t)((uint32_t)carry>>31);
//           ... ((uint32_t)carry >> 31);
// and a compiler seems to have observed that the masking leaves the left
// operand of the addition positive, and that the unsigned shift right
// leaves the right operand positive too. So based on an assumption that
// signed integer overflow will not happen it deduces that the sum will also
// be positive, and hence that on the next line (carry>>31) will be zero.
// For the assumption to fail there will have had to be integer overflow, and
// the C/C++ standards say that the consequence of that are undefined - a term
// that can include behaviour as per the optimised code here.
//
// To avoid that I am working on the basis that casts between int32_t and
// uint32_t will leave bit representations unchanged and that arithmetic uses
// twos complement for signed values. Then by casting to unsigned at times
// I can allow a carry to propagate into the top bit of a word without that
// counting as an overflow, and that should force the compiler to do the
// arithmetic in full.
//
// Having spotted this particular case I now worry about how many related
// ones there may be hiding in the code!
//
        carry = ADD32(clear_top_bit(~bignum_digits(a)[i]),
                      top_bit(carry));
        bignum_digits(b)[i] = clear_top_bit(carry);
    }
// Handle the top digit separately since it is signed.
    carry = ADD32(~bignum_digits(a)[i], top_bit(carry));
    if (!signed_overflow(carry))
    {
// If the most significant word ends up as -1 then I just might
// have 0x40000000 in the next word down and so I may need to shrink
// the number. Since I handled 1-word bignums specially I have at
// least two words to deal with here.
        if (carry == -1 && (bignum_digits(b)[i-1] & 0x40000000) != 0)
        {   bignum_digits(b)[i-1] |= ~0x7fffffff;   // sign-extend new top digit
            setnumhdr(b, numhdr(b) - pack_hdrlength(1));
// The word vacated by shrinking must be left tidy: either zero padding
// or a filler header, depending on alignment (which differs between
// 32- and 64-bit builds).
            if (SIXTY_FOUR_BIT)
            {   if ((i & 1) != 0) bignum_digits(b)[i] = 0;
                else *reinterpret_cast<Header *>(&bignum_digits(b)[i]) = make_bighdr(
                            2);
            }
            else
            {   if ((i & 1) == 0) bignum_digits(b)[i] = 0;
                else *reinterpret_cast<Header *>(&bignum_digits(b)[i]) = make_bighdr(
                            2);
            }
        }
        else bignum_digits(b)[i] = carry; // no shrinking needed
        return b;
    }
// Here I have overflow: this can only happen when I negate a number
// that started off with 0xc0000000 in the most significant digit,
// and I have to pad a zero word onto the front.
    bignum_digits(b)[i] = clear_top_bit(carry);
    return lengthen_by_one_bit(b, carry);
}
171
172 //
173 // generic negation
174 //
175
// Generic negation: dispatch on the representation of a Lisp number
// (immediate fixnum/short-float, boxed bignum/ratio/complex, boxed float)
// and return -a.  Non-numbers raise an error via aerror1.
LispObject negate(LispObject a)
{ switch (static_cast<int>(a) & TAG_BITS)
    {   case TAG_FIXNUM:
// Immediate data: a short float is negated by flipping the sign bit of
// its packed representation; otherwise this is a fixnum.
            if (!SIXTY_FOUR_BIT && is_sfloat(a))
                return a ^ 0x80000000U;
            if (SIXTY_FOUR_BIT && is_sfloat(a))
                return a ^ UINT64_C(0x8000000000000000);
// make_lisp_integer64 copes with the case where negating the most
// negative fixnum overflows into bignum territory.
            else return make_lisp_integer64(-int_of_fixnum(a));
        case TAG_NUMBERS:
        {   int32_t ha = type_of_header(numhdr(a));
            switch (ha)
            {   case TYPE_BIGNUM:
                    return negateb(a);
                case TYPE_RATNUM:
// -(n/d) = (-n)/d.  The recursive negate may allocate (and hence
// trigger GC), so d is protected across it.
                {   LispObject n = numerator(a),
                               d = denominator(a);
                    Save save(d);
                    n = negate(n);
                    save.restore(d);
                    return make_ratio(n, d);
                }
                case TYPE_COMPLEX_NUM:
// -(r + i*j) = (-r) + (-i)*j, again protecting the live half across
// each potentially-allocating recursive call.
                {   LispObject r = real_part(a),
                               i = imag_part(a);
                    Save save(i);
                    r = negate(r);
                    save.restore(i);
                    Save save1(r);
                    i = negate(i);
                    save1.restore(r);
                    return make_complex(r, i);
                }
                default:
                    return aerror1("bad arg for minus", a);
            }
        }
        case TAG_BOXFLOAT:
            switch (type_of_header(flthdr(a)))
            {   case TYPE_SINGLE_FLOAT:
                    return make_boxfloat(-single_float_val(a),
                                         TYPE_SINGLE_FLOAT);
                case TYPE_DOUBLE_FLOAT:
                    return make_boxfloat(-double_float_val(a),
                                         TYPE_DOUBLE_FLOAT);
#ifdef HAVE_SOFTFLOAT
                case TYPE_LONG_FLOAT:
                {   float128_t aa = long_float_val(a);
                    f128M_negate(&aa);
                    return make_boxfloat128(aa);
                }
#endif // HAVE_SOFTFLOAT
            }
// Deliberate fallthrough: if the inner switch matched nothing it falls
// into the default label below and reports an error.
        default:
            return aerror1("bad arg for minus", a);
    }
}
232
233
234 /*****************************************************************************/
235 //** Transcendental functions etcetera. **
236 /*****************************************************************************/
237
238
239 //
240 // Much of the code here is extracted from the portable Fortran library
241 // used by Codemist with its Fortran compiler.
242 //
243
244 //
245 // The object of the following macro is to adjust the floating point
246 // variables concerned so that the more significant one can be squared
247 // with NO LOSS OF PRECISION. It is only used when there is no danger
248 // of over- or under-flow.
249 //
250 // This code is NOT PORTABLE but can be modified for use elsewhere
251 // It should, however, serve for IEEE and IBM FP formats.
252 //
253
// Union allowing the byte-level representation of a double to be read
// and modified (used by the _fp_normalize macro below to zero the low
// half of a mantissa).
typedef union _char_double
{ double d;
    char c[8];
} char_double_union;

// Byte offset of the low-order 32 bits of the mantissa within the
// 8-byte representation of a double, which depends on endianness.
#ifdef LITTLEENDIAN
#define LOW_BITS_OFFSET 0
#else
#define LOW_BITS_OFFSET 4
#endif
264
// The code here explicitly puns between a double and a row of char values
// so that it can force the bottom 32 bits of the representation of the
// double to be zero. The use of the char type here and then memset to clear
// it is intended to keep me safe from strict-aliasing concerns, and modern
// C compilers are liable to map the use of memset onto a simple store
// instruction.
271
// Split `high` so that the low 32 bits of its mantissa become zero,
// adding the discarded part into `low`.  Afterwards high*high can be
// formed with no rounding error (the point of the exercise).
#define _fp_normalize(high, low)                                          \
    {   char_double_union temp;  /* access to representation */           \
        temp.d = high;           /* take original number */               \
        std::memset(&temp.c[LOW_BITS_OFFSET], 0, 4);                      \
        /* make low part of mantissa 0 */                                 \
        low += (high - temp.d);  /* add into low-order result */          \
        high = temp.d;                                                    \
    }
280
281 //
282 // A modern C system will provide a datatype "complex double" which
283 // will (I hope) provide direct implementations of some things I need
284 // here. However in the end I may prefer not to use it because for
285 // real floating point I am using crlibm that implements correctly
286 // rounded and hence consistent across all platforms values. If I use
287 // that as a basis for my complex code I will at least get bit-for-bit
288 // identical results everywhere even if I do not manage to achieve
289 // correctly rounded last-bit performance in all cases.
290 // log, sqrt and all the inverse trig functions here need careful review
291 // as to their treatment of -0.0 on branch-cuts!
292 //
293
Cabs(Complex z)294 double Cabs(Complex z)
295 {
296 //
297 // Obtain the absolute value of a complex number - note that the main
298 // agony here is in ensuring that neither overflow nor underflow can
299 // wreck the calculation. Given ideal arithmetic the sum could be carried
300 // through as just sqrt(x^2 + y^2).
301 //
302 double x = z.real, y = z.imag;
303 double scale;
304 int n1, n2;
305 if (x==0.0) return std::fabs(y);
306 else if (y==0.0) return std::fabs(x);
307 static_cast<void>(std::frexp(x, &n1));
308 static_cast<void>(std::frexp(y, &n2));
309 // The exact range of values returned by frexp does not matter here
310 if (n2>n1) n1 = n2;
311 // n1 is now the exponent of the larger (in absolute value) of x, y
312 scale = std::ldexp(1.0, n1); // can not be 0.0
313 x /= scale;
314 y /= scale;
315 // The above scaling operation introduces no rounding error (since the
316 // scale factor is exactly a power of 2). It reduces the larger of x, y
317 // to be somewhere near 1.0 so overflow in x*x+y*y is impossible. It is
318 // still possible that one of x*x and y*y will underflow (but not both)
319 // but this is harmless.
320 return scale * std::sqrt(x*x + y*y);
321 }
322
Ccos(Complex z)323 Complex Ccos(Complex z)
324 { double x = z.real, y = z.imag;
325 //
326 // cos(x + iy) = cos(x)*cosh(y) - i sin(x)*sinh(y)
327 // For smallish y this can be used directly. For |y| > 50 I will
328 // compute sinh and cosh as just +/- exp(|y|)/2
329 //
330 double s = std::sin(x), c = std::cos(x);
331 double absy = std::fabs(y);
332 if (absy <= 50.0)
333 { double sh = std::sinh(y), ch = std::cosh(y);
334 z.real = c*ch;
335 z.imag = - s*sh;
336 return z;
337 }
338 else
339 { double w;
340 int n = _reduced_exp(absy, &w) - 1;
341 z.real = std::ldexp(c*w, n);
342 if (y < 0.0) z.imag = std::ldexp(s*w, n);
343 else z.imag = std::ldexp(-s*w, n);
344 return z;
345 }
346 }
347
static double reduced_power(double a, int n)
//
// Compute (1 + a)^n - 1 without the roundoff that forming (1+a)^n
// directly would suffer when a is small.  Requires n >= 1 on entry.
// Works by binary recursion: from d = (1+a)^(n/2) - 1 we get
// (1+a)^(2*(n/2)) - 1 = (2+d)*d, with one extra factor folded in when
// n is odd.
//
{   if (n == 1) return a;
    double half = reduced_power(a, n/2);
    double result = (2.0 + half)*half;            // squares the partial power
    if ((n & 1) != 0) result += (1.0 + result)*a; // odd n: one more (1+a)
    return result;
}
361
362 //
363 // The following value is included for documentation purposes - it
364 // give the largest args that can be given to exp() without leading to
365 // overflow on IEEE-arithmetic machines.
366 // #define _exp_arg_limit 709.78271289338397
367 // Note that in any case exp(50.0) will not overflow (it is only 5.2e21),
368 // so it can be evaluated the simple direct way.
369 //
370
_reduced_exp(double x,double * r)371 int _reduced_exp(double x, double *r)
372 {
373 //
374 // (*r) = exp(x)/2^n; return n;
375 // where n will be selected so that *r gets set to a fairly small value
376 // (precise range of r unimportant provided it will be WELL away from
377 // chances of overflow, even when exp(x) would actually overflow). This
378 // function may only be called with argument x that is positive and
379 // large enough that n will end up satisfying n>=1. The coding here
380 // will ensure that if x>4.0, and in general the use of this function
381 // will only be for x > 50.
382 // For IBM hardware it would be good to be able to control the value
383 // of n mod 4, maybe, to help counter wobbling precision. This is not
384 // done here.
385 //
386 int n;
387 double f;
388 n = static_cast<int>(x / 7.625 + 0.5);
389 //
390 // 7.625 = 61/8 and is expected to have an exact floating point
391 // representation here, so f is computed without any rounding error.
392 // (do I need something like the (x - 0.5) - 0.5 trick here?)
393 //
394 f = std::exp(x - 7.625*static_cast<double>(n));
395 //
396 // the magic constant is ((exp(61/8) / 2048) - 1) and it arises because
397 // 61/88 is a decent rational approximation to log(2), hence exp(61/8)
398 // is almost 2^11. Thus I compute exp(x) as
399 // 2^(11*n) * (exp(61/8)/2^11)^n * exp(f)
400 // The first factor is exact, the second is (1+e)^n where e is small and
401 // n is an integer, so can be computer accurately, and the residue f at the
402 // end is small enough not to give over-bad trouble.
403 // The numeric constant given here was calculated with the REDUCE 3.3
404 // bigfloat package.
405 //
406 #define _e61q8 3.81086435594567676751e-4
407 *r = reduced_power(_e61q8, n)*f + f;
408 #undef _e61q8
409 return 11*n;
410 }
411
Cexp(Complex z)412 Complex Cexp(Complex z)
413 { double x = z.real, y = z.imag;
414 //
415 // value is exp(x)*(cos(y) + i sin(y)) but have care with overflow
416 // Here (and throughout the complex library) there is an opportunity
417 // to save time by computing sin(y) and cos(y) together. Since this
418 // code is (to begin with) to sit on top of an arbitrary C library,
419 // perhaps with hardware support for the calculation of real-valued
420 // trig functions I am not going to try to realise this saving.
421 //
422 double s = std::sin(y), c = std::cos(y);
423 //
424 // if x > 50 I will use a cautious sceme which computes exp(x) with
425 // its (binary) exponent separated. Note that 50.0 is chosen as a
426 // number noticably smaller than _exp_arg_limit (exp(50) = 5.18e21),
427 // but is not a critical very special number.
428 //
429 if (x <= 50.0) // includes x < 0.0, of course
430 { double w = std::exp(x);
431 z.real = w*c;
432 z.imag = w*s;
433 return z;
434 }
435 else
436 { double w;
437 int n = _reduced_exp(x, &w);
438 z.real = std::ldexp(w*c, n);
439 z.imag = std::ldexp(w*s, n);
440 return z;
441 }
442 }
443
Cln(Complex z)444 Complex Cln(Complex z)
445 { double x = z.real, y = z.imag;
446 //
447 // if x and y are both very large then cabs(z) may be out of range
448 // even though log or if is OK. Thus it is necessary to perform an
449 // elaborate scaled calculation here, and not just
450 // z.real = log(cabs(z));
451 //
452 double scale, r;
453 int n1, n2;
454 if (x==0.0) r = std::log(std::fabs(y));
455 else if (y==0.0) r = std::log(std::fabs(x));
456 else
457 { static_cast<void>(std::frexp(x, &n1));
458 static_cast<void>(std::frexp(y, &n2));
459 // The exact range of values returned by frexp does not matter here
460 if (n2>n1) n1 = n2;
461 scale = std::ldexp(1.0, n1);
462 x /= scale;
463 y /= scale;
464 r = std::log(scale) + 0.5*std::log(x*x + y*y);
465 }
466 z.real = r;
467 //
468 // The C standard is not very explicit about the behaviour of atan2(0.0, -n)
469 // while for Fortran it is necessary that this returns +pi not -pi. Hence
470 // with extreme caution I put a special test here.
471 //
472 if (y == 0.0)
473 if (x < 0.0) z.imag = _pi;
474 else z.imag = 0.0;
475 else z.imag = std::atan2(y, x);
476 return z;
477 }
478
479
480 //
481 // Complex raising to a power. This seems to be pretty nasty
482 // to get right, and the code includes extra high precision variants
483 // on atan() and log(). Further refinements wrt efficiency may be
484 // possible later on.
485 // This code has been partially tested, and seems to be uniformly
486 // better than using just a**b = exp(b*log(a)), but much more careful
487 // study is needed before it can possibly be claimed that it is
488 // right in the sense of not throwing away accuracy when it does not
489 // have to. I also need to make careful checks to verify that the
490 // correct (principal) value is computed.
491 //
492
493 //
494 // The next function is used after arithmetic has been done on extra-
495 // precision numbers so that the relationship between high and low parts
496 // is no longer known. Re-instate it.
497 //
498
499 #define _two_minus_25 2.98023223876953125e-8 // 2^(-25)
500
// Extended-precision addition: return the high part of a+b and store the
// low-order residue in *lowres, with the high part "normalised" (low 32
// mantissa bits zero) so it can later be multiplied without rounding.
static double fp_add(double a, double b, double *lowres)
{
// Result is the high part of a+b, with the low part assigned to *lowres
    double absa, absb;
    if (a >= 0.0) absa = a;
    else absa = -a;
    if (b >= 0.0) absb = b;
    else absb = -b;
    if (absa < absb)
    {   double t = a; a = b; b = t;
    }
// Now a is the larger (in absolute value) of the two numbers
    if (absb > absa * _two_minus_25)
    {   double al = 0.0, bl = 0.0;
//
// If the exponent difference between a and b is no more than 25 then
// I can add the top part (20 or 24 bits) of a to the top part of b
// without going beyond the 52 or 56 bits that a full mantissa can hold.
//
        _fp_normalize(a, al);
        _fp_normalize(b, bl);
        a = a + b;   // No rounding needed here
        b = al + bl;
        if (a == 0.0)   // total cancellation of the high parts
        {   a = b;
            b = 0.0;
        }
    }
//
// The above step leaves b small wrt the value in a (unless a+b led
// to substantial cancellation of leading digits), but leaves the high
// part a with bits everywhere. Force low part of a to zero.
//
    {   double al = 0.0;
        _fp_normalize(a, al);
        b = b + al;
    }
    if (a >= 0.0) absa = a;
    else absa = -a;
    if (b >= 0.0) absb = b;
    else absb = -b;
    if (absb > absa * _two_minus_25)
//
// If on input a is close to -b, then a+b is close to zero. In this
// case the exponents of a and b matched, and so earlier calculations
// have all been done exactly. Go around again to split residue into
// high and low parts.
//
    {   double al = 0.0, bl = 0.0;
        _fp_normalize(b, bl);
        a = a + b;
        _fp_normalize(a, al);
        b = bl + al;
    }
    *lowres = b;
    return a;
}
558
559 #undef _two_minus_25
560
// Compute atan2(b, a) in one-and-a-half precision: the result is
// delivered as a high part *thetah plus a low-order correction *thetal.
// Works by octant reduction, then a table of atan(k/16) values, then a
// short Taylor series on a residual quotient smaller than 1/32.
static void extended_atan2(double b, double a, double *thetah,
                           double *thetal)
{ int octant;
    double rh, rl, thh, thl;
//
// First reduce the argument to the first octant (i.e. a, b both +ve,
// and b <= a).  The octant number encodes which symmetry was applied
// so the final answer can be reconstructed in the switch at the end.
//
    if (b < 0.0)
    {   octant = 4;
        a = -a;
        b = -b;
    }
    else octant = 0;
    if (a < 0.0)
    {   double t = a;
        octant += 2;
        a = b;
        b = -t;
    }
    if (b > a)
    {   double t = a;
        octant += 1;
        a = b;
        b = t;
    }

    {   static struct
        {   double h;
            double l;
        } _atan[] =
        {
//
// The table here gives atan(n/16) in 1.5-precision for n=0..16
// Note that all the magic numbers used in this file were calculated
// using the REDUCE bigfloat package, and all the 'exact' parts have
// a denominator of at worst 2^26 and so are expected to have lots
// of trailing zero bits in their floating point representation.
//
            { 0.0,                          0.0 },
            { 0.06241881847381591796875,   -0.84778585694947708870e-8 },
            { 0.124355018138885498046875,  -2.35921240630155201508e-8 },
            { 0.185347974300384521484375,  -2.43046897565983490387e-8 },
            { 0.2449786663055419921875,    -0.31786778380154175187e-8 },
            { 0.302884876728057861328125,  -0.83530864557675689054e-8 },
            { 0.358770668506622314453125,   0.17639499059427950639e-8 },
            { 0.412410438060760498046875,   0.35366268088529162896e-8 },
            { 0.4636476039886474609375,     0.50121586552767562314e-8 },
            { 0.512389481067657470703125,  -2.07569197640365239794e-8 },
            { 0.558599293231964111328125,   2.21115983246433832164e-8 },
            { 0.602287352085113525390625,  -0.59501493437085023057e-8 },
            { 0.643501102924346923828125,   0.58689374629746842287e-8 },
            { 0.6823165416717529296875,     1.32029951485689299817e-8 },
            { 0.71882998943328857421875,    1.01883359311982641515e-8 },
            { 0.75315129756927490234375,   -1.66070805128190106297e-8 },
            { 0.785398185253143310546875,  -2.18556950009312141541e-8 }
        };
// Pick the nearest table entry atan(k/16) to atan(b/a).
        int k = static_cast<int>(16.0*(b/a + 0.03125)); // 0 to 16
        double kd = static_cast<double>(k)/16.0;
        double ah = a, al = 0.0,
               bh = b, bl = 0.0,
               ch, cl, q, q2;
        _fp_normalize(ah, al);
        _fp_normalize(bh, bl);
// Rotate the vector (a, b) by -atan(kd) using exact (normalised)
// multiplications; kd has few mantissa bits so these products are exact.
        ch = bh - ah*kd;  cl = bl - al*kd;
        ah = ah + bh*kd;  al = al + bl*kd;
        bh = ch;  bl = cl;
// Now |(a/b)| <= 1/32
        ah = fp_add(ah, al, &al);   // Re-normalise
        bh = fp_add(bh, bl, &bl);
// Compute approximation to b/a
        rh = (bh + bl)/(ah + al);  rl = 0.0;
        _fp_normalize(rh, rl);
        bh -= ah*rh;  bl -= al*rh;
        rl = (bh + bl)/(ah + al);   // Quotient now formed
//
// Now it is necessary to compute atan(q) to one-and-a-half precision.
// Since |q| < 1/32 I will leave just q as the high order word of
// the result and compute atan(q)-q as a single precision value. This
// gives about 16 bits accuracy beyond regular single precision work.
//
        q = rh + rl;
        q2 = q*q;
// The expansion that follows could be done better using a minimax poly
        rl -= q*q2*(0.33333333333333333333 -
                    q2*(0.20000000000000000000 -
                        q2*(0.14285714285714285714 -
                            q2*(0.11111111111111111111 -
                                q2* 0.09090909090909090909))));
// OK - now (rh, rl) is atan(reduced b/a). Need to add on atan(kd)
        rh += _atan[k].h;
        rl += _atan[k].l;
    }
//
// The following constants give high precision versions of pi and pi/2,
// and the high partwords (p2h and pih) have lots of low order zero bits
// in their binary representation. Expect (=require) that the arithmetic
// that computes thh is done without introduced rounding error.
//
#define _p2h 1.57079632580280303955078125
#define _p2l 9.92093579680540441639751e-10

#define _pih 3.14159265160560607910156250
#define _pil 1.984187159361080883279502e-9

// Undo the octant reduction applied at the top.
    switch (octant)
    {   default:
        case 0: thh = rh;         thl = rl;         break;
        case 1: thh = _p2h - rh;  thl = _p2l - rl;  break;
        case 2: thh = _p2h + rh;  thl = _p2l + rl;  break;
        case 3: thh = _pih - rh;  thl = _pil - rl;  break;
        case 4: thh = -_pih + rh; thl = -_pil + rl; break;
        case 5: thh = -_p2h - rh; thl = -_p2l - rl; break;
        case 6: thh = -_p2h + rh; thl = -_p2l + rl; break;
        case 7: thh = -rh;        thl = -rl;        break;
    }

#undef _p2h
#undef _p2l
#undef _pih
#undef _pil

    *thetah = fp_add(thh, thl, thetal);
}
685
// Compute k*log(2) + 0.5*log(a^2 + b^2) in one-and-a-half precision,
// returning a high part *logrh and a low-order correction *logrl.
// The caller (Cpow) has already scaled (a, b) by 2^k so that a^2 + b^2
// lies in roughly [0.25, 2.0].
static void extended_log(int k, double a, double b,
                         double *logrh, double *logrl)
{
//
// If we had exact arithmetic this procedure could be:
//       k*log(2) + 0.5*log(a^2 + b^2)
//
    double al = 0.0, bl = 0.0, all = 0.0, bll = 0.0, c, ch, cl, cll;
    double w, q, qh, ql, rh, rl;
    int n;
//
// First (a^2 + b^2) is calculated, using extra precision.
// Because any rounding at this stage can lead to bad errors in
// the power that I eventually want to compute, I use 3-word
// arithmetic here, and with the version of _fp_normalize given
// above and IEEE or IBM370 arithmetic this part of the
// computation is exact.
//
    _fp_normalize(a, al);  _fp_normalize(al, all);
    _fp_normalize(b, bl);  _fp_normalize(bl, bll);
    ch = a*a + b*b;
    cl = 2.0*(a*al + b*bl);
    cll = (al*al + bl*bl) +
          all*(2.0*(a + al) + all) +
          bll*(2.0*(b + bl) + bll);
    _fp_normalize(ch, cl);
    _fp_normalize(cl, cll);
    c = ch + (cl + cll);   // single precision approximation
//
// At this stage the scaling of the input value will mean that we
// have 0.25 <= c <= 2.0
//
// Now rewrite things as
//      (2*k + n)*log(s) + 0.5*log((a^2 + b^2)/2^n))
// where s = sqrt(2)
// and where the arg to the log is in sqrt(0.5), sqrt(2)
//
#define _sqrt_half 0.70710678118654752440
#define _sqrt_two  1.41421356237309504880
    k = 2*k;
    while (c < _sqrt_half)
    {   k -= 1;
        ch *= 2.0;   // doubling/halving is exact, so all three words scale
        cl *= 2.0;
        cll *= 2.0;
        c *= 2.0;
    }
    while (c > _sqrt_two)
    {   k += 1;
        ch *= 0.5;
        cl *= 0.5;
        cll *= 0.5;
        c *= 0.5;
    }
#undef _sqrt_half
#undef _sqrt_two
// Multiply by the nearest n/16 to 1/c so that c is pulled close to 1;
// n ends up in 11..23 and indexes _log_table below.
    n = static_cast<int>(16.0/c + 0.5);
    w = static_cast<double>(n) / 16.0;
    ch *= w;
    cl *= w;
    cll *= w;   // Now |c-1| < 0.04317
    ch = (ch - 0.5) - 0.5;   // form c-1 without rounding
    ch = fp_add(ch, cl, &cl);
    cl = cl + cll;
//
// (ch, cl) is now the reduced argument ready for calculating log(1+c),
// and now that the reduction is over I can drop back to the use of just
// two doubleprecision values to represent c.
//
    c = ch + cl;
    qh = c / (2.0 + c);
    ql = 0.0;
    _fp_normalize(qh, ql);
    ql = ((ch - qh*(2.0 + ch)) + cl - qh*cl) / (2.0 + c);
// (qh, ql) is now c/(2.0 + c)
    rh = qh;   // 18 bits bigger than low part will end up
    q = qh + ql;
    w = q*q;
    rl = ql + q*w*(0.33333333333333333333 +
                   w*(0.20000000000000000000 +
                      w*(0.14285714285714285714 +
                         w*(0.11111111111111111111 +
                            w*(0.09090909090909090909)))));
//
// (rh, rl) is now 0.5*log(1+c) correct to double precision plus about
// 18 bits: the series above is atanh(q) with q = c/(2+c), and
// log(1+c) = 2*atanh(c/(2+c)).  (An earlier comment here said atan(c),
// which was a slip - the signs in the series are all positive.)
//
    {   double temp;
        static struct
        {   double h;
            double l;
        } _log_table[] =
        {
//
// The following values are (in extra precision) -log(n/16)/2 for n
// in the range 11 to 23 (i.e. roughly over sqrt(2)/2 to sqrt(2))
//
            {  0.1873466968536376953125,      2.786706765149099245e-8 },
            {  0.1438410282135009765625,      0.801238948715710950e-8 },
            {  0.103819668292999267578125,    1.409612298322959552e-8 },
            {  0.066765725612640380859375,   -2.930037906928620319e-8 },
            {  0.0322692394256591796875,      2.114312640614896196e-8 },
            {  0.0,                           0.0 },
            { -0.0303122997283935546875,     -1.117982386660280307e-8 },
            { -0.0588915348052978515625,      1.697710612429310295e-8 },
            { -0.08592510223388671875,       -2.622944289242004947e-8 },
            { -0.111571788787841796875,       1.313073691899185245e-8 },
            { -0.135966837406158447265625,   -2.033566243215020975e-8 },
            { -0.159226894378662109375,       2.881939480146987639e-8 },
            { -0.18145275115966796875,        0.431498374218108783e-8 }
        };

        rh = fp_add(rh, _log_table[n-11].h, &temp);
        rl = temp + _log_table[n-11].l + rl;
    }

// Finally add in k*log(sqrt(2)), split into an exact part (many trailing
// zero bits, so k*exact is rounding-free) plus a tiny correction.
#define _exact_part_logroot2   0.3465735912322998046875
#define _approx_part_logroot2  (-9.5232714997888393927e-10)
// Multiply this by k and add it in
    {   double temp, kd = static_cast<double>(k);
        rh = fp_add(rh, kd*_exact_part_logroot2, &temp);
        rl = rl + temp + kd*_approx_part_logroot2;
    }
#undef _exact_part_logroot2
#undef _approx_part_logroot2

    *logrh = rh;
    *logrl = rl;
    return;
}
817
818
// Complex z1 ** z2, computed as exp(z2 * log(z1)) but with the argument
// to exp carried in one-and-a-half precision (via extended_atan2 and
// extended_log) so that exp does not magnify rounding errors when the
// real part is large, and so that range reduction of the imaginary part
// stays accurate.
Complex Cpow(Complex z1, Complex z2)
{ double a = z1.real, b = z1.imag,
         c = z2.real, d = z2.imag;
    int k, m, n;
    double scale;
    double logrh, logrl, thetah, thetal;
    double r, i, rh, rl, ih, il, clow, dlow, q;
    double cw, sw, cost, sint;

    if (b == 0.0 && d == 0.0 &&
        a >= 0.0)   // Simple case if both args are real
    {   z1.real = std::pow(a, c);
        z1.imag = 0.0;
        return z1;
    }
//
// Start by scaling z1 by dividing out a power of 2 so that |z1| is
// fairly close to 1
//
    if (a == 0.0)
    {   if (b == 0.0) return z1;   // 0.0**anything is really an error
// The exact values returned by frexp do not matter here
        static_cast<void>(std::frexp(b, &k));
    }
    else
    {   static_cast<void>(std::frexp(a, &k));
        if (b != 0.0)
        {   int n;
            static_cast<void>(std::frexp(b, &n));
            if (n > k) k = n;
        }
    }
    scale = std::ldexp(1.0, k);   // exact power of 2: divisions are exact
    a /= scale;
    b /= scale;
//
// The idea next is to express z1 as r*exp(i theta), then z1**z2
// is exp(z2*log(z1)) and the exponent simplifies to
//      (c*log r - d*theta) + i(c theta + d log r).
// Note r = scale*sqrt(a*a + b*b) now.
// The argument for exp() must be computed to noticeably more than
// regular precision, since otherwise exp() greatly magnifies the
// error in the real part (if it is large and positive) and range
// reduction in the imaginary part can equally lead to accuracy
// problems. Neither c nor d can usefully be anywhere near the
// extreme range of floating point values, so I will not even try
// to scale them, thus I will end up happy(ish) if I can compute
// atan2(b, a) and log(|z1|) in one-and-a-half precision form.
// It would be best to compute theta in units of pi/4 rather than in
// radians, since then the case of z^n (integer n) with arg(z) an
// exact multiple of pi/4 would work out better. However at present I
// just hope that the 1.5-precision working is good enough.
//
    extended_atan2(b, a, &thetah, &thetal);
    extended_log(k, a, b, &logrh, &logrl);

// Normalise all numbers
    clow = 0.0;
    dlow = 0.0;
    _fp_normalize(c, clow);
    _fp_normalize(d, dlow);

// (rh, rl) = c*logr - d*theta;
    rh = c*logrh - d*thetah;   // No rounding in this computation
    rl = c*logrl + clow*(logrh + logrl) - d*thetal - dlow*
         (thetah + thetal);

// (ih, il) = c*theta + d*logr;
    ih = c*thetah + d*logrh;   // No rounding in this computation
    il = c*thetal + clow*(thetah + thetal) + d*logrl + dlow*
         (logrh + logrl);

//
// Now it remains to take the exponential of the extended precision
// value in ((rh, rl), (ih, il)).
// Range reduce the real part by multiples of log(2), and the imaginary
// part by multiples of pi/2.
//

    rh = fp_add(rh, rl, &rl);   // renormalise
    r = rh + rl;   // Approximate value

#define _recip_log_2 1.4426950408889634074

    q = r * _recip_log_2;
// Round to the nearest integer multiple of log 2.
    m = (q < 0.0) ? static_cast<int>(q - 0.5) : static_cast<int>(q + 0.5);
    q = static_cast<double>(m);
#undef _recip_log_2
//
// log 2 = 11629080/2^24 - 0.000 00000 19046 54299 95776 78785
// to reasonable accuracy. It is vital that the exact part be
// read in as a number with lots of low zero bits, so when it gets
// multiplied by the integer q there is NO rounding needed.
//

#define _exact_part_log2  0.693147182464599609375
#define _approx_part_log2 (-1.9046542999577678785418e-9)
    rh = rh - q*_exact_part_log2;
    rl = rl - q*_approx_part_log2;
#undef _exact_part_log2
#undef _approx_part_log2
    r = std::exp(rh + rl);   // This should now be accurate enough

    ih = fp_add(ih, il, &il);
    i = ih + il;   // Approximate value

#define _recip_pi_by_2 0.6366197723675813431

    q = i * _recip_pi_by_2;
// Round to the nearest integer multiple of pi/2; n mod 4 selects the
// quadrant correction applied after sin/cos below.
    n = (q < 0.0) ? static_cast<int>(q - 0.5) : static_cast<int>(q + 0.5);
    q = static_cast<double>(n);
//
// pi/2 = 105414357/2^26 + 0.000 00000 09920 93579 68054 04416 39751
// to reasonable accuracy. It is vital that the exact part be
// read in as a number with lots of low zero bits, so when it gets
// multiplied by the integer q there is NO rounding needed.
//

#define _exact_part_pi2  1.57079632580280303955078125
#define _approx_part_pi2 9.92093579680540441639751e-10
    ih = ih - q*_exact_part_pi2;
    il = il - q*_approx_part_pi2;
#undef _recip_pi_by_2
#undef _exact_part_pi2
#undef _approx_part_pi2
    i = ih + il;   // Now accurate enough
//
// Having done super-careful range reduction I can call the regular
// sin/cos routines here. If sin/cos could both be computed together
// that could speed things up a little bit, but by this stage they have
// not much in common.
//
    cw = std::cos(i);
    sw = std::sin(i);
    switch (n & 3)   // quadrant control
    {   default:
        case 0: cost = cw;  sint = sw;  break;
        case 1: cost = -sw; sint = cw;  break;
        case 2: cost = -cw; sint = -sw; break;
        case 3: cost = sw;  sint = -cw; break;
    }

//
// Now, at long last, I can assemble the results and return.
//

    z1.real = std::ldexp(r*cost, m);
    z1.imag = std::ldexp(r*sint, m);
    return z1;
}
969
970 //
971 // End of complex-to-complex-power code.
972 //
973
974 // end of arith07.cpp
975