1 //  arith07.cpp                           Copyright (C) 1990-2021 Codemist
2 
3 //
4 // Arithmetic functions.  negation plus a load of Common Lisp things
5 // for support of complex numbers.
6 //
7 //
8 
9 /**************************************************************************
10  * Copyright (C) 2021, Codemist.                         A C Norman       *
11  *                                                                        *
12  * Redistribution and use in source and binary forms, with or without     *
13  * modification, are permitted provided that the following conditions are *
14  * met:                                                                   *
15  *                                                                        *
16  *     * Redistributions of source code must retain the relevant          *
17  *       copyright notice, this list of conditions and the following      *
18  *       disclaimer.                                                      *
19  *     * Redistributions in binary form must reproduce the above          *
20  *       copyright notice, this list of conditions and the following      *
21  *       disclaimer in the documentation and/or other materials provided  *
22  *       with the distribution.                                           *
23  *                                                                        *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    *
25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      *
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS      *
27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE         *
28  * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,   *
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,   *
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS  *
31  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
32  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR  *
33  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF     *
34  * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH   *
35  * DAMAGE.                                                                *
36  *************************************************************************/
37 
38 
39 // $Id: arith07.cpp 5604 2021-01-20 22:22:10Z arthurcnorman $
40 
41 #include "headers.h"
42 
43 
copyb(LispObject a)44 LispObject copyb(LispObject a)
45 //
46 // copy a bignum.
47 //
48 {   LispObject b;
49     size_t len = bignum_length(a), i;
50     Save save(a);
51     b = get_basic_vector(TAG_NUMBERS, TYPE_BIGNUM, len);
52     save.restore(a);
53     len = (len-CELL)/4;
54     for (i=0; i<len; i++)
55         bignum_digits(b)[i] = vbignum_digits(a)[i];
56     return b;
57 }
58 
LispObject negateb(LispObject a)
//
// Negate a bignum.  Note that negating the 1-word bignum
// value of 0x08000000 will produce a fixnum as a result (for 32 bits),
// which might confuse the caller... in a similar way negating
// the value -0x40000000 will need to promote from a one-word
// bignum to a 2-word bignum.  How messy just for negation!
// And on 64 bit systems the same effect applies but with larger values!
// In an analogous manner negating the positive number of the style
// 0x40000000 can lead to a negative result that uses one less digit.
// Well on a 64-bit machine it is a 2-word bignum that can end up
// negated to get a fixnum result.
//
{   LispObject b;
    size_t len = bignum_length(a), i;  // len is in bytes, including the header
    int32_t carry;
// There are two messy special cases here. The first is that there is a
// positive value (2^27 or 2^59) which has to be represented as a bignum,
// but when you negate it you get a fixnum.
// Then there will be negative values (the smallest being -2^31 or -2^62)
// that fit in a certain number of words of bignum, but their absolute
// value needs one more word...
// Note that on a 64-bit machine there ought never to be any one-word
// bignums because all the values representable with just one 31-bit digit
// can be handled as fixnums instead.
    if (SIXTY_FOUR_BIT &&
        len == CELL+8)   // two-word bignum - do specially
    {
// Negating 2^59 yields the most negative fixnum.
        if (bignum_digits(a)[0] == 0 &&
            bignum_digits(a)[1] == (int32_t)0x10000000)
            return MOST_NEGATIVE_FIXNUM;
// Negating -2^61 needs an extra digit, so grow to a three-word bignum.
        else if (bignum_digits(a)[0] == 0 &&
                 (int32_t)bignum_digits(a)[1] == -(int32_t)(1<<30))
            return make_three_word_bignum(0, 1<<30, 0);
        uint32_t d0 = bignum_digits(a)[0];
        int32_t d1 = (int32_t)~bignum_digits(a)[1];
// NOTE(review): when d0 == 0 the incremented d1 is never used - control
// drops through to the general digit-by-digit path below, which handles
// that case itself; the d1++ looks like a leftover. Confirm before tidying.
        if (d0 == 0) d1++;
        else return make_two_word_bignum(d1, (-d0) & 0x7fffffff);
    }
    if (!SIXTY_FOUR_BIT &&
        len == CELL+4)   // one-word bignum - do specially
    {   int32_t d0 = -(int32_t)bignum_digits(a)[0];
// The two boundary cases: shrink to a fixnum, or grow by one digit.
        if (d0 == MOST_NEGATIVE_FIXVAL) return MOST_NEGATIVE_FIXNUM;
        else if (d0 == 0x40000000) return make_two_word_bignum(0, d0);
        else return make_one_word_bignum(d0);
    }
// General case: allocate a result of the same length, protecting a
// across the (possibly garbage-collecting) allocation.
    {   Save save(a);
        b = get_basic_vector(TAG_NUMBERS, TYPE_BIGNUM, len);
        errexit();
        save.restore(a);
    }
// Negate as ~a + 1.  Each digit holds 31 bits (the top bit of each word
// is kept clear); the loop below covers every digit except the topmost.
    len = (len-CELL-4)/4;
    carry = -1;
    for (i=0; i<len; i++)
    {
// The next couple of lines really caught me out wrt compiler optimisation
// before I put in all the casts. I used to have what was in effect
//    carry = (signed_x ^ 0x7fffffff) + (int32_t)((uint32_t)carry>>31);
//      ... ((uint32_t)carry >> 31);
// and a compiler seems to have observed that the masking leaves the left
// operand of the addition positive, and that the unsigned shift right
// leaves the right operand positive too. So based on an assumption that
// signed integer overflow will not happen it deduces that the sum will also
// be positive, and hence that on the next line (carry>>31) will be zero.
// For the assumption to fail there will have had to be integer overflow, and
// the C/C++ standards say that the consequence of that are undefined - a term
// that can include behaviour as per the optimised code here.
//
// To avoid that I am working on the basis that casts between int32_t and
// uint32_t will leave bit representations unchanged and that arithmetic uses
// twos complement for signed values. Then by casting to unsigned at times
// I can allow a carry to propagate into the top bit of a word without that
// counting as an overflow, and that should force the compiler to do the
// arithmetic in full.
//
// Having spotted this particular case I now worry about how many related
// ones there may be hiding in the code!
//
        carry = ADD32(clear_top_bit(~bignum_digits(a)[i]),
                      top_bit(carry));
        bignum_digits(b)[i] = clear_top_bit(carry);
    }
// Handle the top digit separately since it is signed.
    carry = ADD32(~bignum_digits(a)[i], top_bit(carry));
    if (!signed_overflow(carry))
    {
// If the most significant word ends up as -1 then I just might
// have 0x40000000 in the next word down and so I may need to shrink
// the number.  Since I handled 1-word bignums specially I have at
// least two words to deal with here.
        if (carry == -1 && (bignum_digits(b)[i-1] & 0x40000000) != 0)
        {   bignum_digits(b)[i-1] |= ~0x7fffffff;  // sign-extend new top digit
            setnumhdr(b, numhdr(b) - pack_hdrlength(1));
// The word freed by shrinking is either zeroed as padding or turned into
// a small header, depending on alignment - presumably so that the heap
// remains linearly scannable; confirm against the allocator's rules.
            if (SIXTY_FOUR_BIT)
            {   if ((i & 1) != 0) bignum_digits(b)[i] = 0;
                else *reinterpret_cast<Header *>(&bignum_digits(b)[i]) = make_bighdr(
                                2);
            }
            else
            {   if ((i & 1) == 0) bignum_digits(b)[i] = 0;
                else *reinterpret_cast<Header *>(&bignum_digits(b)[i]) = make_bighdr(
                                2);
            }
        }
        else bignum_digits(b)[i] = carry;   // no shrinking needed
        return b;
    }
// Here I have overflow: this can only happen when I negate a number
// that started off with 0xc0000000 in the most significant digit,
// and I have to pad a zero word onto the front.
    bignum_digits(b)[i] = clear_top_bit(carry);
    return lengthen_by_one_bit(b, carry);
}
171 
172 //
173 // generic negation
174 //
175 
// Generic negation: dispatch on the tag of a and negate fixnums,
// short floats, bignums, ratios, complex numbers and boxed floats.
// Signals an error for any other object.
LispObject negate(LispObject a)
{   switch (static_cast<int>(a) & TAG_BITS)
    {   case TAG_FIXNUM:
// Short floats are tested for here too: they are negated by flipping
// the sign bit of the embedded representation.
            if (!SIXTY_FOUR_BIT && is_sfloat(a))
                return a ^ 0x80000000U;
            if (SIXTY_FOUR_BIT && is_sfloat(a))
                return a ^ UINT64_C(0x8000000000000000);
// A genuine fixnum: negating the most negative fixnum overflows fixnum
// range, so rebuild through make_lisp_integer64 which can promote.
            else return make_lisp_integer64(-int_of_fixnum(a));
        case TAG_NUMBERS:
        {   int32_t ha = type_of_header(numhdr(a));
            switch (ha)
            {   case TYPE_BIGNUM:
                    return negateb(a);
                case TYPE_RATNUM:
// Negate just the numerator; the denominator is protected across the
// recursive call because negate may allocate.
                {   LispObject n = numerator(a),
                                   d = denominator(a);
                    Save save(d);
                    n = negate(n);
                    save.restore(d);
                    return make_ratio(n, d);
                }
                case TYPE_COMPLEX_NUM:
// Negate real and imaginary parts independently, protecting whichever
// one is not being worked on across each recursive call.
                {   LispObject r = real_part(a),
                                   i = imag_part(a);
                    Save save(i);
                    r = negate(r);
                    save.restore(i);
                    Save save1(r);
                    i = negate(i);
                    save1.restore(r);
                    return make_complex(r, i);
                }
                default:
                    return aerror1("bad arg for minus",  a);
            }
        }
        case TAG_BOXFLOAT:
            switch (type_of_header(flthdr(a)))
            {   case TYPE_SINGLE_FLOAT:
                    return make_boxfloat(-single_float_val(a),
                                         TYPE_SINGLE_FLOAT);
                case TYPE_DOUBLE_FLOAT:
                    return make_boxfloat(-double_float_val(a),
                                         TYPE_DOUBLE_FLOAT);
#ifdef HAVE_SOFTFLOAT
                case TYPE_LONG_FLOAT:
                {   float128_t aa = long_float_val(a);
                    f128M_negate(&aa);
                    return make_boxfloat128(aa);
                }
#endif // HAVE_SOFTFLOAT
            }
// Deliberate fall-through: a TAG_BOXFLOAT whose header matched none of
// the cases above (e.g. long floats without HAVE_SOFTFLOAT) lands in
// the error case below.
        default:
            return aerror1("bad arg for minus",  a);
    }
}
232 
233 
234 /*****************************************************************************/
235 //**              Transcendental functions etcetera.                       **
236 /*****************************************************************************/
237 
238 
239 //
240 // Much of the code here is extracted from the portable Fortran library
241 // used by Codemist with its Fortran compiler.
242 //
243 
244 //
245 // The object of the following macro is to adjust the floating point
246 // variables concerned so that the more significant one can be squared
247 // with NO LOSS OF PRECISION. It is only used when there is no danger
248 // of over- or under-flow.
249 //
250 // This code is NOT PORTABLE but can be modified for use elsewhere
251 // It should, however, serve for IEEE and IBM FP formats.
252 //
253 
// Union used to pun a double against its raw bytes so that the
// low-order 32 bits of its representation can be cleared with memset
// (see _fp_normalize below).
typedef union _char_double
{   double d;    // the value viewed as a double
    char c[8];   // the same 8 bytes viewed as characters
} char_double_union;

// Byte offset of the low-order 32 bits of the double within the union:
// they come first on a little-endian machine and second otherwise.
#ifdef LITTLEENDIAN
#define LOW_BITS_OFFSET 0
#else
#define LOW_BITS_OFFSET 4
#endif
264 
// The code here explicitly puns between a double and a row of char values
// so that it can force the bottom 32 bits of the representation of the
// double to be zero. The use of the char type here and then memset to clear
// it is intended to keep me safe from strict-aliasing concerns, and modern
// C compilers are liable to map the use of memset onto a simple store
// instruction.
271 
// Split "high" so that only the upper part of its mantissa remains set
// (the low 32 bits of the representation are forced to zero), adding
// whatever was removed into "low".  The sum high+low is unchanged, and
// afterwards high can be squared with no rounding error.  Relies on the
// char_double_union byte punning declared above.
#define _fp_normalize(high, low)                                          \
    {   char_double_union temp;  /* access to representation     */       \
        temp.d = high;           /* take original number         */       \
        std::memset(&temp.c[LOW_BITS_OFFSET], 0, 4);                           \
                                 /* make low part of mantissa 0  */       \
        low += (high - temp.d);  /* add into low-order result    */       \
        high = temp.d;                                                    \
    }
280 
281 //
282 // A modern C system will provide a datatype "complex double" which
283 // will (I hope) provide direct implementations of some things I need
284 // here. However in the end I may prefer not to use it because for
285 // real floating point I am using crlibm that implements correctly
286 // rounded and hence consistent across all platforms values. If I use
287 // that as a basis for my complex code I will at least get bit-for-bit
288 // identical results everywhere even if I do not manage to achieve
289 // correctly rounded last-bit performance in all cases.
290 // log, sqrt and all the inverse trig functions here need careful review
291 // as to their treatment of -0.0 on branch-cuts!
292 //
293 
Cabs(Complex z)294 double Cabs(Complex z)
295 {
296 //
297 // Obtain the absolute value of a complex number - note that the main
298 // agony here is in ensuring that neither overflow nor underflow can
299 // wreck the calculation.  Given ideal arithmetic the sum could be carried
300 // through as just sqrt(x^2 + y^2).
301 //
302     double x = z.real, y = z.imag;
303     double scale;
304     int n1, n2;
305     if (x==0.0) return std::fabs(y);
306     else if (y==0.0) return std::fabs(x);
307     static_cast<void>(std::frexp(x, &n1));
308     static_cast<void>(std::frexp(y, &n2));
309 // The exact range of values returned by frexp does not matter here
310     if (n2>n1) n1 = n2;
311 // n1 is now the exponent of the larger (in absolute value) of x, y
312     scale = std::ldexp(1.0, n1);     // can not be 0.0
313     x /= scale;
314     y /= scale;
315 // The above scaling operation introduces no rounding error (since the
316 // scale factor is exactly a power of 2). It reduces the larger of x, y
317 // to be somewhere near 1.0 so overflow in x*x+y*y is impossible. It is
318 // still possible that one of x*x and y*y will underflow (but not both)
319 // but this is harmless.
320     return scale * std::sqrt(x*x + y*y);
321 }
322 
Ccos(Complex z)323 Complex Ccos(Complex z)
324 {   double x = z.real, y = z.imag;
325 //
326 // cos(x + iy) = cos(x)*cosh(y) - i sin(x)*sinh(y)
327 // For smallish y this can be used directly.  For |y| > 50 I will
328 // compute sinh and cosh as just +/- exp(|y|)/2
329 //
330     double s = std::sin(x), c = std::cos(x);
331     double absy = std::fabs(y);
332     if (absy <= 50.0)
333     {   double sh = std::sinh(y), ch = std::cosh(y);
334         z.real = c*ch;
335         z.imag = - s*sh;
336         return z;
337     }
338     else
339     {   double w;
340         int n = _reduced_exp(absy, &w) - 1;
341         z.real = std::ldexp(c*w, n);
342         if (y < 0.0) z.imag = std::ldexp(s*w, n);
343         else z.imag = std::ldexp(-s*w, n);
344         return z;
345     }
346 }
347 
static double reduced_power(double a, int n)
{
//
// Compute (1 + a)^n - 1 while avoiding the cancellation the naive
// formula would suffer when a is small.  Requires n >= 1.
// Works by exponent halving: with d = (1+a)^m - 1,
//   (1+a)^(2m) - 1 = (2 + d)*d
// and an odd exponent adds one further factor of (1+a).
//
    if (n == 1) return a;
    double d = reduced_power(a, n/2);
    d = (2.0 + d)*d;
    if ((n & 1) != 0) d += (1.0 + d)*a;
    return d;
}
361 
362 //
363 // The following value is included for documentation purposes - it
// gives the largest args that can be given to exp() without leading to
365 // overflow on IEEE-arithmetic machines.
366 //   #define _exp_arg_limit 709.78271289338397
367 // Note that in any case exp(50.0) will not overflow (it is only 5.2e21),
368 // so it can be evaluated the simple direct way.
369 //
370 
_reduced_exp(double x,double * r)371 int _reduced_exp(double x, double *r)
372 {
373 //
374 // (*r) = exp(x)/2^n;   return n;
375 // where n will be selected so that *r gets set to a fairly small value
376 // (precise range of r unimportant provided it will be WELL away from
377 // chances of overflow, even when exp(x) would actually overflow). This
378 // function may only be called with argument x that is positive and
379 // large enough that n will end up satisfying n>=1.  The coding here
380 // will ensure that if x>4.0, and in general the use of this function
381 // will only be for x > 50.
382 // For IBM hardware it would be good to be able to control the value
383 // of n mod 4, maybe, to help counter wobbling precision.  This is not
384 // done here.
385 //
386     int n;
387     double f;
388     n = static_cast<int>(x / 7.625 + 0.5);
389 //
390 // 7.625 = 61/8 and is expected to have an exact floating point
391 // representation here, so f is computed without any rounding error.
392 // (do I need something like the (x - 0.5) - 0.5 trick here?)
393 //
394     f = std::exp(x - 7.625*static_cast<double>(n));
395 //
396 // the magic constant is ((exp(61/8) / 2048) - 1) and it arises because
397 // 61/88 is a decent rational approximation to log(2), hence exp(61/8)
398 // is almost 2^11.  Thus I compute exp(x) as
399 //   2^(11*n) * (exp(61/8)/2^11)^n * exp(f)
400 // The first factor is exact, the second is (1+e)^n where e is small and
401 // n is an integer, so can be computer accurately, and the residue f at the
402 // end is small enough not to give over-bad trouble.
403 // The numeric constant given here was calculated with the REDUCE 3.3
404 // bigfloat package.
405 //
406 #define _e61q8     3.81086435594567676751e-4
407     *r = reduced_power(_e61q8, n)*f + f;
408 #undef _e61q8
409     return 11*n;
410 }
411 
Cexp(Complex z)412 Complex Cexp(Complex z)
413 {   double x = z.real, y = z.imag;
414 //
415 // value is exp(x)*(cos(y) + i sin(y)) but have care with overflow
416 // Here (and throughout the complex library) there is an opportunity
417 // to save time by computing sin(y) and cos(y) together.  Since this
418 // code is (to begin with) to sit on top of an arbitrary C library,
419 // perhaps with hardware support for the calculation of real-valued
420 // trig functions I am not going to try to realise this saving.
421 //
422     double s = std::sin(y), c = std::cos(y);
423 //
424 // if x > 50 I will use a cautious sceme which computes exp(x) with
425 // its (binary) exponent separated. Note that 50.0 is chosen as a
426 // number noticably smaller than _exp_arg_limit (exp(50) = 5.18e21),
427 // but is not a critical very special number.
428 //
429     if (x <= 50.0)      // includes x < 0.0, of course
430     {   double w = std::exp(x);
431         z.real = w*c;
432         z.imag = w*s;
433         return z;
434     }
435     else
436     {   double w;
437         int n = _reduced_exp(x, &w);
438         z.real = std::ldexp(w*c, n);
439         z.imag = std::ldexp(w*s, n);
440         return z;
441     }
442 }
443 
Cln(Complex z)444 Complex Cln(Complex z)
445 {   double x = z.real, y = z.imag;
446 //
447 // if x and y are both very large then cabs(z) may be out of range
448 // even though log or if is OK.  Thus it is necessary to perform an
449 // elaborate scaled calculation here, and not just
450 //               z.real = log(cabs(z));
451 //
452     double scale, r;
453     int n1, n2;
454     if (x==0.0) r = std::log(std::fabs(y));
455     else if (y==0.0) r = std::log(std::fabs(x));
456     else
457     {   static_cast<void>(std::frexp(x, &n1));
458         static_cast<void>(std::frexp(y, &n2));
459 // The exact range of values returned by frexp does not matter here
460         if (n2>n1) n1 = n2;
461         scale = std::ldexp(1.0, n1);
462         x /= scale;
463         y /= scale;
464         r = std::log(scale) + 0.5*std::log(x*x + y*y);
465     }
466     z.real = r;
467 //
468 // The C standard is not very explicit about the behaviour of atan2(0.0, -n)
469 // while for Fortran it is necessary that this returns +pi not -pi.  Hence
470 // with extreme caution I put a special test here.
471 //
472     if (y == 0.0)
473         if (x < 0.0) z.imag = _pi;
474         else z.imag = 0.0;
475     else z.imag = std::atan2(y, x);
476     return z;
477 }
478 
479 
480 //
481 // Complex raising to a power.  This seems to be pretty nasty
482 // to get right, and the code includes extra high precision variants
483 // on atan() and log().  Further refinements wrt efficiency may be
484 // possible later on.
485 // This code has been partially tested, and seems to be uniformly
486 // better than using just a**b = exp(b*log(a)), but much more careful
487 // study is needed before it can possibly be claimed that it is
488 // right in the sense of not throwing away accuracy when it does not
489 // have to.  I also need to make careful checks to verify that the
490 // correct (principal) value is computed.
491 //
492 
493 //
494 // The next function is used after arithmetic has been done on extra-
495 // precision numbers so that the relationship between high and low parts
496 // is no longer known.  Re-instate it.
497 //
498 
499 #define _two_minus_25 2.98023223876953125e-8 // 2^(-25)
500 
// Extra-precision addition: return the high-order part of a+b and store
// the low-order correction in *lowres, re-establishing the invariant
// (used throughout the extended-precision code below) that the pair
// (high, low) has a high part whose bottom mantissa bits are zero.
static double fp_add(double a, double b, double *lowres)
{
// Result is the high part of a+b, with the low part assigned to *lowres
    double absa, absb;
    if (a >= 0.0) absa = a;
    else absa = -a;
    if (b >= 0.0) absb = b;
    else absb = -b;
    if (absa < absb)
    {   double t = a; a = b; b = t;
    }
// Now a is the larger (in absolute value) of the two numbers
    if (absb > absa * _two_minus_25)
    {   double al = 0.0, bl = 0.0;
//
// If the exponent difference between a and b is no more than 25 then
// I can add the top part (20 or 24 bits) of a to the top part of b
// without going beyond the 52 or 56 bits that a full mantissa can hold.
//
        _fp_normalize(a, al);
        _fp_normalize(b, bl);
        a = a + b;              // No rounding needed here
        b = al + bl;
// Total cancellation of the high parts: promote the accumulated low
// part to be the (exact) result so far.
        if (a == 0.0)
        {   a = b;
            b = 0.0;
        }
    }
//
// The above step leaves b small wrt the value in a (unless a+b led
// to substantial cancellation of leading digits), but leaves the high
// part a with bits everywhere.  Force low part of a to zero.
//
    {   double al = 0.0;
        _fp_normalize(a, al);
        b = b + al;
    }
    if (a >= 0.0) absa = a;
    else absa = -a;
    if (b >= 0.0) absb = b;
    else absb = -b;
    if (absb > absa * _two_minus_25)
//
// If on input a is close to -b, then a+b is close to zero.  In this
// case the exponents of a and b matched, and so earlier calculations
// have all been done exactly.  Go around again to split residue into
// high and low parts.
//
    {   double al = 0.0, bl = 0.0;
        _fp_normalize(b, bl);
        a = a + b;
        _fp_normalize(a, al);
        b = bl + al;
    }
    *lowres = b;
    return a;
}
558 
559 #undef _two_minus_25
560 
// Compute atan2(b, a) to roughly one-and-a-half double precision,
// returning the result as a high part *thetah and a low correction
// *thetal.  Used by the complex-power code where full precision in the
// angle matters.
static void extended_atan2(double b, double a, double *thetah,
                           double *thetal)
{   int octant;
    double rh, rl, thh, thl;
//
// First reduce the argument to the first octant (i.e. a, b both +ve,
// and b <= a).  The octant number records the reflections applied so
// the final answer can be mapped back at the end.
//
    if (b < 0.0)
    {   octant = 4;
        a = -a;
        b = -b;
    }
    else octant = 0;
    if (a < 0.0)
    {   double t = a;
        octant += 2;
        a = b;
        b = -t;
    }
    if (b > a)
    {   double t = a;
        octant += 1;
        a = b;
        b = t;
    }

    {   static struct
        {   double h;
            double l;
        } _atan[] =
        {
//
// The table here gives atan(n/16) in 1.5-precision for n=0..16
// Note that all the magic numbers used in this file were calculated
// using the REDUCE bigfloat package, and all the 'exact' parts have
// a denominator of at worst 2^26 and so are expected to have lots
// of trailing zero bits in their floating point representation.
//
            { 0.0,                        0.0 },
            { 0.06241881847381591796875,  -0.84778585694947708870e-8 },
            { 0.124355018138885498046875, -2.35921240630155201508e-8 },
            { 0.185347974300384521484375, -2.43046897565983490387e-8 },
            { 0.2449786663055419921875,   -0.31786778380154175187e-8 },
            { 0.302884876728057861328125, -0.83530864557675689054e-8 },
            { 0.358770668506622314453125,  0.17639499059427950639e-8 },
            { 0.412410438060760498046875,  0.35366268088529162896e-8 },
            { 0.4636476039886474609375,    0.50121586552767562314e-8 },
            { 0.512389481067657470703125, -2.07569197640365239794e-8 },
            { 0.558599293231964111328125,  2.21115983246433832164e-8 },
            { 0.602287352085113525390625, -0.59501493437085023057e-8 },
            { 0.643501102924346923828125,  0.58689374629746842287e-8 },
            { 0.6823165416717529296875,    1.32029951485689299817e-8 },
            { 0.71882998943328857421875,   1.01883359311982641515e-8 },
            { 0.75315129756927490234375,  -1.66070805128190106297e-8 },
            { 0.785398185253143310546875, -2.18556950009312141541e-8 }
        };
// Pick the nearest table entry kd = k/16 to b/a, then rotate the
// argument by -atan(kd) using the identity
//   atan(b/a) = atan(kd) + atan((b - a*kd)/(a + b*kd))
        int k = static_cast<int>(16.0*(b/a + 0.03125)); // 0 to 16
        double kd = static_cast<double>(k)/16.0;
        double ah = a, al = 0.0,
               bh = b, bl = 0.0,
               ch, cl, q, q2;
        _fp_normalize(ah, al);
        _fp_normalize(bh, bl);
        ch = bh - ah*kd; cl = bl - al*kd;
        ah = ah + bh*kd; al = al + bl*kd;
        bh = ch; bl = cl;
// Now |(a/b)| <= 1/32
        ah = fp_add(ah, al, &al);       // Re-normalise
        bh = fp_add(bh, bl, &bl);
// Compute approximation to b/a
        rh = (bh + bl)/(ah + al); rl = 0.0;
        _fp_normalize(rh, rl);
        bh -= ah*rh; bl -= al*rh;
        rl = (bh + bl)/(ah + al);   // Quotient now formed
//
// Now it is necessary to compute atan(q) to one-and-a-half precision.
// Since |q| < 1/32 I will leave just q as the high order word of
// the result and compute atan(q)-q as a single precision value. This
// gives about 16 bits accuracy beyond regular single precision work.
//
        q = rh + rl;
        q2 = q*q;
// The expansion that follows could be done better using a minimax poly
        rl -= q*q2*(0.33333333333333333333 -
                    q2*(0.20000000000000000000 -
                        q2*(0.14285714285714285714 -
                            q2*(0.11111111111111111111 -
                                q2* 0.09090909090909090909))));
// OK - now (rh, rl) is atan(reduced b/a).  Need to add on atan(kd)
        rh += _atan[k].h;
        rl += _atan[k].l;
    }
//
// The following constants give high precision versions of pi and pi/2,
// and the high partwords (p2h and pih) have lots of low order zero bits
// in their binary representation.  Expect (=require) that the arithmetic
// that computes thh is done without introduced rounding error.
//
#define _p2h    1.57079632580280303955078125
#define _p2l    9.92093579680540441639751e-10

#define _pih    3.14159265160560607910156250
#define _pil    1.984187159361080883279502e-9

// Undo the octant reduction applied at the top, in both the high and
// low parts of the answer.
    switch (octant)
{       default:
        case 0: thh = rh;          thl = rl;          break;
        case 1: thh = _p2h - rh;   thl = _p2l - rl;   break;
        case 2: thh = _p2h + rh;   thl = _p2l + rl;   break;
        case 3: thh = _pih - rh;   thl = _pil - rl;   break;
        case 4: thh = -_pih + rh;  thl = -_pil + rl;  break;
        case 5: thh = -_p2h - rh;  thl = -_p2l - rl;  break;
        case 6: thh = -_p2h + rh;  thl = -_p2l + rl;  break;
        case 7: thh = -rh;         thl = -rl;         break;
    }

#undef _p2h
#undef _p2l
#undef _pih
#undef _pil

    *thetah = fp_add(thh, thl, thetal);
}
685 
extended_log(int k,double a,double b,double * logrh,double * logrl)686 static void extended_log(int k, double a, double b,
687                          double *logrh, double *logrl)
688 {
689 //
690 // If we had exact arithmetic this procedure could be:
691 //      k*log(2) + 0.5*log(a^2 + b^2)
692 //
693     double al = 0.0, bl = 0.0, all = 0.0, bll = 0.0, c, ch, cl, cll;
694     double w, q, qh, ql, rh, rl;
695     int n;
696 //
697 // First (a^2 + b^2) is calculated, using extra precision.
698 // Because any rounding at this stage can lead to bad errors in
699 // the power that I eventually want to compute, I use 3-word
700 // arithmetic here, and with the version of _fp_normalize given
701 // above and IEEE or IBM370 arithmetic this part of the
702 // computation is exact.
703 //
704     _fp_normalize(a, al); _fp_normalize(al, all);
705     _fp_normalize(b, bl); _fp_normalize(bl, bll);
706     ch = a*a + b*b;
707     cl = 2.0*(a*al + b*bl);
708     cll = (al*al + bl*bl) +
709           all*(2.0*(a + al) + all) +
710           bll*(2.0*(b + bl) + bll);
711     _fp_normalize(ch, cl);
712     _fp_normalize(cl, cll);
713     c = ch + (cl + cll);        // single precision approximation
714 //
715 // At this stage the scaling of the input value will mean that we
716 // have 0.25 <= c <= 2.0
717 //
718 // Now rewrite things as
719 //      (2*k + n)*log(s) + 0.5*log((a^2 + b^2)/2^n))
720 //           where s = sqrt(2)
721 // and where the arg to the log is in sqrt(0.5), sqrt(2)
722 //
723 
724 #define _sqrt_half 0.70710678118654752440
725 #define _sqrt_two  1.41421356237309504880
726     k = 2*k;
727     while (c < _sqrt_half)
728     {   k   -= 1;
729         ch  *= 2.0;
730         cl  *= 2.0;
731         cll *= 2.0;
732         c   *= 2.0;
733     }
734     while (c > _sqrt_two)
735     {   k   += 1;
736         ch  *= 0.5;
737         cl  *= 0.5;
738         cll *= 0.5;
739         c   *= 0.5;
740     }
741 #undef _sqrt_half
742 #undef _sqrt_two
743     n = static_cast<int>(16.0/c + 0.5);
744     w = static_cast<double>(n) / 16.0;
745     ch *= w;
746     cl *= w;
747     cll *= w;           // Now |c-1| < 0.04317
748     ch = (ch - 0.5) - 0.5;
749     ch = fp_add(ch, cl, &cl);
750     cl = cl + cll;
751 //
752 // (ch, cl) is now the reduced argument ready for calculating log(1+c),
753 // and now that the reduction is over I can drop back to the use of just
754 // two doubleprecision values to represent c.
755 //
756     c = ch + cl;
757     qh = c / (2.0 + c);
758     ql = 0.0;
759     _fp_normalize(qh, ql);
760     ql = ((ch - qh*(2.0 + ch)) + cl - qh*cl) / (2.0 + c);
761 // (qh, ql) is now c/(2.0 + c)
762     rh = qh;  // 18 bits bigger than low part will end up
763     q = qh + ql;
764     w = q*q;
765     rl = ql + q*w*(0.33333333333333333333 +
766                    w*(0.20000000000000000000 +
767                       w*(0.14285714285714285714 +
768                          w*(0.11111111111111111111 +
769                             w*(0.09090909090909090909)))));
//
// (rh, rl) is now 0.5*log(1+c), i.e. atanh(c/(2+c)), correct to double
// precision plus about 18 bits.
//
773 
774     {   double temp;
775         static struct
776         {   double h;
777             double l;
778         } _log_table[] =
779         {
780 //
781 // The following values are (in extra precision) -log(n/16)/2 for n
782 // in the range 11 to 23 (i.e. roughly over sqrt(2)/2 to sqrt(2))
783 //
784             {  0.1873466968536376953125,    2.786706765149099245e-8 },
785             {  0.1438410282135009765625,    0.801238948715710950e-8 },
786             {  0.103819668292999267578125,  1.409612298322959552e-8 },
787             {  0.066765725612640380859375, -2.930037906928620319e-8 },
788             {  0.0322692394256591796875,    2.114312640614896196e-8 },
789             {  0.0,                         0.0                     },
790             { -0.0303122997283935546875,   -1.117982386660280307e-8 },
791             { -0.0588915348052978515625,    1.697710612429310295e-8 },
792             { -0.08592510223388671875,     -2.622944289242004947e-8 },
793             { -0.111571788787841796875,     1.313073691899185245e-8 },
794             { -0.135966837406158447265625, -2.033566243215020975e-8 },
795             { -0.159226894378662109375,     2.881939480146987639e-8 },
796             { -0.18145275115966796875,      0.431498374218108783e-8 }
797         };
798 
799         rh = fp_add(rh, _log_table[n-11].h, &temp);
800         rl = temp + _log_table[n-11].l + rl;
801     }
802 
803 #define _exact_part_logroot2    0.3465735912322998046875
804 #define _approx_part_logroot2   (-9.5232714997888393927e-10)
805 // Multiply this by k and add it in
806     {   double temp, kd = static_cast<double>(k);
807         rh = fp_add(rh, kd*_exact_part_logroot2, &temp);
808         rl = rl + temp + kd*_approx_part_logroot2;
809     }
810 #undef _exact_part_logroot2
811 #undef _approx_part_logroot2
812 
813     *logrh = rh;
814     *logrl = rl;
815     return;
816 }
817 
818 
// Cpow: raise one complex number to the power of another.
// Computes z1**z2 as exp(z2*log(z1)).  Because exp() magnifies any error
// in a large positive real part of its argument, and range reduction of
// the imaginary part is delicate, log|z1| and arg(z1) are carried in
// roughly one-and-a-half precision as (high, low) pairs of doubles, and
// only at the very end are ordinary-precision exp/sin/cos applied.
Complex Cpow(Complex z1, Complex z2)
{   double a = z1.real, b = z1.imag,
               c = z2.real, d = z2.imag;
    int k, m, n;
    double scale;
    double logrh, logrl, thetah, thetal;
    double r, i, rh, rl, ih, il, clow, dlow, q;
    double cw, sw, cost, sint;

    if (b == 0.0 && d == 0.0 &&
        a >= 0.0)// Simple case if both args are real with non-negative base
    {   z1.real = std::pow(a, c);
        z1.imag = 0.0;
        return z1;
    }
//
// Start by scaling z1 by dividing out a power of 2 so that |z1| is
// fairly close to 1
//
    if (a == 0.0)
    {   if (b == 0.0) return z1;    // 0.0**anything is really an error
// The exact values returned by frexp do not matter here
        static_cast<void>(std::frexp(b, &k));
    }
    else
    {   static_cast<void>(std::frexp(a, &k));
        if (b != 0.0)
        {   int n;
            static_cast<void>(std::frexp(b, &n));
            if (n > k) k = n;   // k = binary exponent of max(|a|, |b|)
        }
    }
    scale = std::ldexp(1.0, k);     // scale = 2^k, an exact power of two
    a /= scale;
    b /= scale;
//
// The idea next is to express z1 as r*exp(i theta), then z1**z2
// is exp(z2*log(z1)) and the exponent simplifies to
//             (c*log r - d*theta) + i(c theta + d log r).
// Note r = scale*sqrt(a*a + b*b) now.
// The argument for exp() must be computed to noticeably more than
// regular precision, since otherwise exp() greatly magnifies the
// error in the real part (if it is large and positive) and range
// reduction in the imaginary part can equally lead to accuracy
// problems.  Neither c nor d can usefully be anywhere near the
// extreme range of floating point values, so I will not even try
// to scale them, thus I will end up happy(ish) if I can compute
// atan2(b, a) and log(|z1|) in one-and-a-half precision form.
// It would be best to compute theta in units of pi/4 rather than in
// radians, since then the case of z^n (integer n) with arg(z) an
// exact multiple of pi/4 would work out better. However at present I
// just hope that the 1.5-precision working is good enough.
//
    extended_atan2(b, a, &thetah, &thetal);
    extended_log(k, a, b, &logrh, &logrl);

// Normalise all numbers: split c and d into (high, low) parts so that
// products against the extended-precision log and theta values can be
// formed with exact high-part multiplications.
    clow = 0.0;
    dlow = 0.0;
    _fp_normalize(c, clow);
    _fp_normalize(d, dlow);

//  (rh, rl) = c*logr - d*theta;   -- real part of z2*log(z1)
    rh = c*logrh - d*thetah;  // No rounding in this computation
    rl = c*logrl + clow*(logrh + logrl) - d*thetal - dlow*
         (thetah + thetal);

//  (ih, il) = c*theta + d*logr;   -- imaginary part of z2*log(z1)
    ih = c*thetah + d*logrh;  // No rounding in this computation
    il = c*thetal + clow*(thetah + thetal) + d*logrl + dlow*
         (logrh + logrl);

//
// Now it remains to take the exponential of the extended precision
// value in ((rh, rl), (ih, il)).
// Range reduce the real part by multiples of log(2), and the imaginary
// part by multiples of pi/2.
//

    rh = fp_add(rh, rl, &rl);   // renormalise
    r = rh + rl;                // Approximate value

#define _recip_log_2 1.4426950408889634074

    q = r * _recip_log_2;
    m = (q < 0.0) ? static_cast<int>(q - 0.5) : static_cast<int>(q + 0.5);
    q = static_cast<double>(m);     // m = nearest integer to r/log(2)
#undef _recip_log_2
//
// log 2 = 11629080/2^24  - 0.000 00000 19046 54299 95776 78785
// to reasonable accuracy.  It is vital that the exact part be
// read in as a number with lots of low zero bits, so when it gets
// multiplied by the integer q there is NO rounding needed.
//

#define _exact_part_log2    0.693147182464599609375
#define _approx_part_log2 (-1.9046542999577678785418e-9)
    rh = rh - q*_exact_part_log2;
    rl = rl - q*_approx_part_log2;
#undef _exact_part_log2
#undef _approx_part_log2
    r = std::exp(rh + rl);        // This should now be accurate enough

    ih = fp_add(ih, il, &il);     // renormalise the imaginary part too
    i = ih + il;        // Approximate value

#define _recip_pi_by_2 0.6366197723675813431

    q = i * _recip_pi_by_2;
    n = (q < 0.0) ? static_cast<int>(q - 0.5) : static_cast<int>(q + 0.5);
    q = static_cast<double>(n);     // n = nearest integer to i/(pi/2)
//
// pi/2 = 105414357/2^26 + 0.000 00000 09920 93579 68054 04416 39751
// to reasonable accuracy.  It is vital that the exact part be
// read in as a number with lots of low zero bits, so when it gets
// multiplied by the integer q there is NO rounding needed.
//

#define _exact_part_pi2    1.57079632580280303955078125
#define _approx_part_pi2   9.92093579680540441639751e-10
    ih = ih - q*_exact_part_pi2;
    il = il - q*_approx_part_pi2;
#undef _recip_pi_by_2
#undef _exact_part_pi2
#undef _approx_part_pi2
    i = ih + il;            // Now accurate enough
//
// Having done super-careful range reduction I can call the regular
// sin/cos routines here.  If sin/cos could both be computed together
// that could speed things up a little bit, but by this stage they have
// not much in common.
//
    cw = std::cos(i);
    sw = std::sin(i);
    switch (n & 3)  // quadrant control: undo the n*(pi/2) reduction via
                    // cos(w + n*pi/2) / sin(w + n*pi/2) identities
{       default:
        case 0: cost = cw;  sint = sw;  break;
        case 1: cost = -sw; sint = cw;  break;
        case 2: cost = -cw; sint = -sw; break;
        case 3: cost = sw;  sint = -cw; break;
    }

//
// Now, at long last, I can assemble the results and return.
//

    z1.real = std::ldexp(r*cost, m);    // re-apply the 2^m from range reduction
    z1.imag = std::ldexp(r*sint, m);
    return z1;
}
969 
970 //
971 // End of complex-to-complex-power code.
972 //
973 
974 // end of arith07.cpp
975