1 /***********************************************************************************/
2 /** MIT License **/
3 /** ----------- **/
4 /** **/
5 /** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
6 /** **/
7 /** Permission is hereby granted, free of charge, to any person obtaining a copy **/
8 /** of this Software and associated documentaon files (the "Software"), to deal **/
9 /** in the Software without restriction, including without limitation the rights **/
10 /** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
11 /** copies of the Software, and to permit persons to whom the Software is **/
12 /** furnished to do so, subject to the following conditions: **/
13 /** **/
14 /** The above copyright notice and this permission notice shall be included in **/
15 /** all copies or substantial portions of the Software. **/
16 /** **/
17 /** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
18 /** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
19 /** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
20 /** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
21 /** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
22 /** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
23 /** THE SOFTWARE. **/
24 /***********************************************************************************/
25 
26 #ifndef LIBM_INLINES_AMD_H_INCLUDED
27 #define LIBM_INLINES_AMD_H_INCLUDED 1
28 
29 #include "libm_util.h"
30 
31 /* Set defines for inline functions calling other inlines */
32 #if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \
33     defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \
34     defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \
35     defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \
36     defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \
37     defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \
38     (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF))
39 #undef USE_RAISE_FPSW_FLAGS
40 #define USE_RAISE_FPSW_FLAGS 1
41 #endif
42 
43 #if defined(USE_SPLITDOUBLE)
44 /* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
45    Assumes that x is not zero, denormal, infinity or NaN, but these conditions
46    are not checked */
splitDouble(double x,int * e,double * m)47 static inline void splitDouble(double x, int *e, double *m)
48 {
49   unsigned long long ux, uy;
50   GET_BITS_DP64(x, ux);
51   uy = ux;
52   ux &= EXPBITS_DP64;
53   ux >>= EXPSHIFTBITS_DP64;
54   *e = (int)ux - EXPBIAS_DP64 + 1;
55   uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64;
56   PUT_BITS_DP64(uy, x);
57   *m = x;
58 }
59 #endif /* USE_SPLITDOUBLE */
60 
61 
62 #if defined(USE_SPLITDOUBLE_2)
63 /* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0.
64    Assumes that x is not zero, denormal, infinity or NaN, but these conditions
65    are not checked. Also assumes EXPBIAS_DP is odd. With this
66    assumption, e will be even on exit. */
splitDouble_2(double x,int * e,double * m)67 static inline void splitDouble_2(double x, int *e, double *m)
68 {
69   unsigned long long ux, vx;
70   GET_BITS_DP64(x, ux);
71   vx = ux;
72   ux &= EXPBITS_DP64;
73   ux >>= EXPSHIFTBITS_DP64;
74   if (ux & 1)
75     {
76       /* The exponent is odd */
77       vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64;
78       PUT_BITS_DP64(vx, x);
79       *m = x;
80       *e = ux - EXPBIAS_DP64;
81     }
82   else
83     {
84       /* The exponent is even */
85       vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64;
86       PUT_BITS_DP64(vx, x);
87       *m = x;
88       *e = ux - EXPBIAS_DP64 - 1;
89     }
90 }
91 #endif /* USE_SPLITDOUBLE_2 */
92 
93 
94 #if defined(USE_SPLITFLOAT)
95 /* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
96    Assumes that x is not zero, denormal, infinity or NaN, but these conditions
97    are not checked */
splitFloat(float x,int * e,float * m)98 static inline void splitFloat(float x, int *e, float *m)
99 {
100   unsigned int ux, uy;
101   GET_BITS_SP32(x, ux);
102   uy = ux;
103   ux &= EXPBITS_SP32;
104   ux >>= EXPSHIFTBITS_SP32;
105   *e = (int)ux - EXPBIAS_SP32 + 1;
106   uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32;
107   PUT_BITS_SP32(uy, x);
108   *m = x;
109 }
110 #endif /* USE_SPLITFLOAT */
111 
112 
113 #if defined(USE_SCALEDOUBLE_1)
114 /* Scales the double x by 2.0**n.
115    Assumes EMIN <= n <= EMAX, though this condition is not checked. */
scaleDouble_1(double x,int n)116 static inline double scaleDouble_1(double x, int n)
117 {
118   double t;
119   /* Construct the number t = 2.0**n */
120   PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t);
121   return x*t;
122 }
123 #endif /* USE_SCALEDOUBLE_1 */
124 
125 
126 #if defined(USE_SCALEDOUBLE_2)
127 /* Scales the double x by 2.0**n.
128    Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
scaleDouble_2(double x,int n)129 static inline double scaleDouble_2(double x, int n)
130 {
131   double t1, t2;
132   int n1, n2;
133   n1 = n / 2;
134   n2 = n - n1;
135   /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
136   PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
137   PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
138   return (x*t1)*t2;
139 }
140 #endif /* USE_SCALEDOUBLE_2 */
141 
142 
143 #if defined(USE_SCALEDOUBLE_3)
144 /* Scales the double x by 2.0**n.
145    Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
scaleDouble_3(double x,int n)146 static inline double scaleDouble_3(double x, int n)
147 {
148   double t1, t2, t3;
149   int n1, n2, n3;
150   n1 = n / 3;
151   n2 = (n - n1) / 2;
152   n3 = n - n1 - n2;
153   /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
154   PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
155   PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
156   PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3);
157   return ((x*t1)*t2)*t3;
158 }
159 #endif /* USE_SCALEDOUBLE_3 */
160 
161 
162 #if defined(USE_SCALEFLOAT_1)
163 /* Scales the float x by 2.0**n.
164    Assumes EMIN <= n <= EMAX, though this condition is not checked. */
scaleFloat_1(float x,int n)165 static inline float scaleFloat_1(float x, int n)
166 {
167   float t;
168   /* Construct the number t = 2.0**n */
169   PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t);
170   return x*t;
171 }
172 #endif /* USE_SCALEFLOAT_1 */
173 
174 
175 #if defined(USE_SCALEFLOAT_2)
176 /* Scales the float x by 2.0**n.
177    Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
scaleFloat_2(float x,int n)178 static inline float scaleFloat_2(float x, int n)
179 {
180   float t1, t2;
181   int n1, n2;
182   n1 = n / 2;
183   n2 = n - n1;
184   /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
185   PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
186   PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
187   return (x*t1)*t2;
188 }
189 #endif /* USE_SCALEFLOAT_2 */
190 
191 
192 #if defined(USE_SCALEFLOAT_3)
193 /* Scales the float x by 2.0**n.
194    Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
scaleFloat_3(float x,int n)195 static inline float scaleFloat_3(float x, int n)
196 {
197   float t1, t2, t3;
198   int n1, n2, n3;
199   n1 = n / 3;
200   n2 = (n - n1) / 2;
201   n3 = n - n1 - n2;
202   /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
203   PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
204   PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
205   PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3);
206   return ((x*t1)*t2)*t3;
207 }
208 #endif /* USE_SCALEFLOAT_3 */
209 
210 #if defined(USE_SETPRECISIONDOUBLE)
setPrecisionDouble(void)211 unsigned int setPrecisionDouble(void)
212 {
213   unsigned int cw, cwold = 0;
214   /* There is no precision control on Hammer */
215   return cwold;
216 }
217 #endif /* USE_SETPRECISIONDOUBLE */
218 
219 #if defined(USE_RESTOREPRECISION)
restorePrecision(unsigned int cwold)220 void restorePrecision(unsigned int cwold)
221 {
222   /* There is no precision control on Hammer */
223   return;
224 }
225 #endif /* USE_RESTOREPRECISION */
226 
227 
228 
229 
230 #if defined(USE_RAISE_FPSW_FLAGS)
231 /* Raises floating-point status flags. The argument should be
232    the bitwise or of the flags to be raised, from the
233    list above, e.g.
234      raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
235  */
236 
237 /* ISSUE - wat - 08182010
238  * These AMD_ISW_* flags are duplicated from trans.h
239  * this is not clean; Mark S. did it for targeted fix of 855457
240  * Eliminate all redundant flags in the next overhaul
241  */
242 
243 #define AMD_ISW_INVALID     0x0001
244 #define AMD_ISW_DENORMAL    0x0002
245 #define AMD_ISW_ZERODIVIDE  0x0004
246 #define AMD_ISW_OVERFLOW    0x0008
247 #define AMD_ISW_UNDERFLOW   0x0010
248 #define AMD_ISW_INEXACT     0x0020
249 
250 /* use this function from fpctrl.c */
251 void _set_statfp(uintptr_t);
252 
raise_fpsw_flags(int flags)253 static inline void raise_fpsw_flags(int flags)
254 {
255     unsigned int f = 0;
256 
257     if (flags & AMD_F_OVERFLOW)     { f |= AMD_ISW_OVERFLOW; }
258     if (flags & AMD_F_UNDERFLOW)    { f |= AMD_ISW_UNDERFLOW; }
259     if (flags & AMD_F_DIVBYZERO)    { f |= AMD_ISW_ZERODIVIDE; }
260     if (flags & AMD_F_INVALID)      { f |= AMD_ISW_INVALID; }
261     if (flags & AMD_F_INEXACT)      { f |= AMD_ISW_INEXACT; }
262 
263     _set_statfp(f);
264 }
265 
266 #endif /* USE_RAISE_FPSW_FLAGS */
267 
268 
269 #if defined(USE_GET_FPSW_INLINE)
270 /* Return the current floating-point status word */
get_fpsw_inline(void)271 static inline unsigned int get_fpsw_inline(void)
272 {
273   return _mm_getcsr();
274 }
275 #endif /* USE_GET_FPSW_INLINE */
276 
277 #if defined(USE_SET_FPSW_INLINE)
278 /* Set the floating-point status word */
set_fpsw_inline(unsigned int sw)279 static inline void set_fpsw_inline(unsigned int sw)
280 {
281   _mm_setcsr(sw);
282 }
283 #endif /* USE_SET_FPSW_INLINE */
284 
285 
286 
287 #if defined(USE_VAL_WITH_FLAGS)
288 /* Returns a double value after raising the given flags,
289   e.g.  val_with_flags(AMD_F_INEXACT);
290  */
val_with_flags(double val,int flags)291 static inline double val_with_flags(double val, int flags)
292 {
293   raise_fpsw_flags(flags);
294   return val;
295 }
296 #endif /* USE_VAL_WITH_FLAGS */
297 
298 #if defined(USE_VALF_WITH_FLAGS)
299 /* Returns a float value after raising the given flags,
300   e.g.  valf_with_flags(AMD_F_INEXACT);
301  */
valf_with_flags(float val,int flags)302 static inline float valf_with_flags(float val, int flags)
303 {
304   raise_fpsw_flags(flags);
305   return val;
306 }
307 #endif /* USE_VALF_WITH_FLAGS */
308 
309 
310 #if defined(USE_ZERO_WITH_FLAGS)
311 /* Returns a double +zero after raising the given flags,
312   e.g.  zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
313  */
zero_with_flags(int flags)314 static inline double zero_with_flags(int flags)
315 {
316   raise_fpsw_flags(flags);
317   return 0.0;
318 }
319 #endif /* USE_ZERO_WITH_FLAGS */
320 
321 
322 #if defined(USE_ZEROF_WITH_FLAGS)
323 /* Returns a float +zero after raising the given flags,
324   e.g.  zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
325  */
zerof_with_flags(int flags)326 static inline float zerof_with_flags(int flags)
327 {
328   raise_fpsw_flags(flags);
329   return 0.0F;
330 }
331 #endif /* USE_ZEROF_WITH_FLAGS */
332 
333 
334 #if defined(USE_NAN_WITH_FLAGS)
335 /* Returns a double quiet +nan after raising the given flags,
336    e.g.  nan_with_flags(AMD_F_INVALID);
337 */
nan_with_flags(int flags)338 static inline double nan_with_flags(int flags)
339 {
340   double z;
341   raise_fpsw_flags(flags);
342   PUT_BITS_DP64(0x7ff8000000000000, z);
343   return z;
344 }
345 #endif /* USE_NAN_WITH_FLAGS */
346 
347 #if defined(USE_NANF_WITH_FLAGS)
348 /* Returns a float quiet +nan after raising the given flags,
349    e.g.  nanf_with_flags(AMD_F_INVALID);
350 */
nanf_with_flags(int flags)351 static inline float nanf_with_flags(int flags)
352 {
353   float z;
354   raise_fpsw_flags(flags);
355   PUT_BITS_SP32(0x7fc00000, z);
356   return z;
357 }
358 #endif /* USE_NANF_WITH_FLAGS */
359 
360 
361 #if defined(USE_INDEFINITE_WITH_FLAGS)
362 /* Returns a double indefinite after raising the given flags,
363    e.g.  indefinite_with_flags(AMD_F_INVALID);
364 */
indefinite_with_flags(int flags)365 static inline double indefinite_with_flags(int flags)
366 {
367   double z;
368   raise_fpsw_flags(flags);
369   PUT_BITS_DP64(0xfff8000000000000, z);
370   return z;
371 }
372 #endif /* USE_INDEFINITE_WITH_FLAGS */
373 
374 #if defined(USE_INDEFINITEF_WITH_FLAGS)
375 /* Returns a float quiet +indefinite after raising the given flags,
376    e.g.  indefinitef_with_flags(AMD_F_INVALID);
377 */
indefinitef_with_flags(int flags)378 static inline float indefinitef_with_flags(int flags)
379 {
380   float z;
381   raise_fpsw_flags(flags);
382   PUT_BITS_SP32(0xffc00000, z);
383   return z;
384 }
385 #endif /* USE_INDEFINITEF_WITH_FLAGS */
386 
387 
388 #ifdef USE_INFINITY_WITH_FLAGS
389 /* Returns a positive double infinity after raising the given flags,
390    e.g.  infinity_with_flags(AMD_F_OVERFLOW);
391 */
infinity_with_flags(int flags)392 static inline double infinity_with_flags(int flags)
393 {
394   double z;
395   raise_fpsw_flags(flags);
396   PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z);
397   return z;
398 }
399 #endif /* USE_INFINITY_WITH_FLAGS */
400 
401 #ifdef USE_INFINITYF_WITH_FLAGS
402 /* Returns a positive float infinity after raising the given flags,
403    e.g.  infinityf_with_flags(AMD_F_OVERFLOW);
404 */
infinityf_with_flags(int flags)405 static inline float infinityf_with_flags(int flags)
406 {
407   float z;
408   raise_fpsw_flags(flags);
409   PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z);
410   return z;
411 }
412 #endif /* USE_INFINITYF_WITH_FLAGS */
413 
414 #if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)
415 #include <errno.h>
416 #endif
417 
418 /* define the Microsoft specific error handling routine */
419 double _handle_error(
420         char *fname,
421         int opcode,
422         unsigned long long value,
423         int type,
424         int flags,
425         int error,
426         double arg1,
427         double arg2,
428         int nargs
429         );
430 float _handle_errorf(
431         char *fname,
432         int opcode,
433         unsigned long long value,
434         int type,
435         int flags,
436         int error,
437         float arg1,
438         float arg2,
439         int nargs
440         );
441 
442 #if defined(USE_SPLITEXP)
443 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
444    Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
445    abs(x) > large/(ln(base)) (where large is the largest representable
446    floating point number) should be handled separately instead of calling
447    this function. This function is called by exp, exp2, exp10,
448    cosh and sinh. */
splitexp(double x,double logbase,double thirtytwo_by_logbaseof2,double logbaseof2_by_32_lead,double logbaseof2_by_32_trail,int * m,double * z1,double * z2)449 static inline void splitexp(double x, double logbase,
450                             double thirtytwo_by_logbaseof2,
451                             double logbaseof2_by_32_lead,
452                             double logbaseof2_by_32_trail,
453                             int *m, double *z1, double *z2)
454 {
455   double q, r, r1, r2, f1, f2;
456   int n, j;
457 
458 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
459    leading and trailing parts respectively of precomputed
460    values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
461    two_to_jby32_lead_table contains the first 25 bits of precision,
462    and two_to_jby32_trail_table contains a further 53 bits precision. */
463 
464   static const double two_to_jby32_lead_table[32] = {
465     1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
466     1.02189713716506958008e+00,   /* 0x3ff059b0d0000000 */
467     1.04427373409271240234e+00,   /* 0x3ff0b55860000000 */
468     1.06714040040969848633e+00,   /* 0x3ff11301d0000000 */
469     1.09050768613815307617e+00,   /* 0x3ff172b830000000 */
470     1.11438673734664916992e+00,   /* 0x3ff1d48730000000 */
471     1.13878858089447021484e+00,   /* 0x3ff2387a60000000 */
472     1.16372483968734741211e+00,   /* 0x3ff29e9df0000000 */
473     1.18920707702636718750e+00,   /* 0x3ff306fe00000000 */
474     1.21524733304977416992e+00,   /* 0x3ff371a730000000 */
475     1.24185776710510253906e+00,   /* 0x3ff3dea640000000 */
476     1.26905095577239990234e+00,   /* 0x3ff44e0860000000 */
477     1.29683953523635864258e+00,   /* 0x3ff4bfdad0000000 */
478     1.32523661851882934570e+00,   /* 0x3ff5342b50000000 */
479     1.35425549745559692383e+00,   /* 0x3ff5ab07d0000000 */
480     1.38390988111495971680e+00,   /* 0x3ff6247eb0000000 */
481     1.41421353816986083984e+00,   /* 0x3ff6a09e60000000 */
482     1.44518077373504638672e+00,   /* 0x3ff71f75e0000000 */
483     1.47682613134384155273e+00,   /* 0x3ff7a11470000000 */
484     1.50916439294815063477e+00,   /* 0x3ff8258990000000 */
485     1.54221081733703613281e+00,   /* 0x3ff8ace540000000 */
486     1.57598084211349487305e+00,   /* 0x3ff93737b0000000 */
487     1.61049032211303710938e+00,   /* 0x3ff9c49180000000 */
488     1.64575546979904174805e+00,   /* 0x3ffa5503b0000000 */
489     1.68179279565811157227e+00,   /* 0x3ffae89f90000000 */
490     1.71861928701400756836e+00,   /* 0x3ffb7f76f0000000 */
491     1.75625211000442504883e+00,   /* 0x3ffc199bd0000000 */
492     1.79470902681350708008e+00,   /* 0x3ffcb720d0000000 */
493     1.83400803804397583008e+00,   /* 0x3ffd5818d0000000 */
494     1.87416762113571166992e+00,   /* 0x3ffdfc9730000000 */
495     1.91520655155181884766e+00,   /* 0x3ffea4afa0000000 */
496     1.95714408159255981445e+00};  /* 0x3fff507650000000 */
497 
498   static const double two_to_jby32_trail_table[32] = {
499     0.00000000000000000000e+00,   /* 0x0000000000000000 */
500     1.14890470981563546737e-08,   /* 0x3e48ac2ba1d73e2a */
501     4.83347014379782142328e-08,   /* 0x3e69f3121ec53172 */
502     2.67125131841396124714e-10,   /* 0x3df25b50a4ebbf1b */
503     4.65271045830351350190e-08,   /* 0x3e68faa2f5b9bef9 */
504     5.24924336638693782574e-09,   /* 0x3e368b9aa7805b80 */
505     5.38622214388600821910e-08,   /* 0x3e6ceac470cd83f6 */
506     1.90902301017041969782e-08,   /* 0x3e547f7b84b09745 */
507     3.79763538792174980894e-08,   /* 0x3e64636e2a5bd1ab */
508     2.69306947081946450986e-08,   /* 0x3e5ceaa72a9c5154 */
509     4.49683815095311756138e-08,   /* 0x3e682468446b6824 */
510     1.41933332021066904914e-09,   /* 0x3e18624b40c4dbd0 */
511     1.94146510233556266402e-08,   /* 0x3e54d8a89c750e5e */
512     2.46409119489264118569e-08,   /* 0x3e5a753e077c2a0f */
513     4.94812958044698886494e-08,   /* 0x3e6a90a852b19260 */
514     8.48872238075784476136e-10,   /* 0x3e0d2ac258f87d03 */
515     2.42032342089579394887e-08,   /* 0x3e59fcef32422cbf */
516     3.32420002333182569170e-08,   /* 0x3e61d8bee7ba46e2 */
517     1.45956577586525322754e-08,   /* 0x3e4f580c36bea881 */
518     3.46452721050003920866e-08,   /* 0x3e62999c25159f11 */
519     8.07090469079979051284e-09,   /* 0x3e415506dadd3e2a */
520     2.99439161340839520436e-09,   /* 0x3e29b8bc9e8a0388 */
521     9.83621719880452147153e-09,   /* 0x3e451f8480e3e236 */
522     8.35492309647188080486e-09,   /* 0x3e41f12ae45a1224 */
523     3.48493175137966283582e-08,   /* 0x3e62b5a75abd0e6a */
524     1.11084703472699692902e-08,   /* 0x3e47daf237553d84 */
525     5.03688744342840346564e-08,   /* 0x3e6b0aa538444196 */
526     4.81896001063495806249e-08,   /* 0x3e69df20d22a0798 */
527     4.83653666334089557746e-08,   /* 0x3e69f7490e4bb40b */
528     1.29745882314081237628e-08,   /* 0x3e4bdcdaf5cb4656 */
529     9.84532844621636118964e-09,   /* 0x3e452486cc2c7b9d */
530     4.25828404545651943883e-08};  /* 0x3e66dc8a80ce9f09 */
531 
532     /*
533       Step 1. Reduce the argument.
534 
535       To perform argument reduction, we find the integer n such that
536       x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
537       n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
538       remainder by x - n*logbaseof2/32. The calculation of n is
539       straightforward whereas the computation of x - n*logbaseof2/32
540       must be carried out carefully.
541       logbaseof2/32 is so represented in two pieces that
542       (1) logbaseof2/32 is known to extra precision, (2) the product
543       of n and the leading piece is a model number and is hence
544       calculated without error, and (3) the subtraction of the value
545       obtained in (2) from x is a model number and is hence again
546       obtained without error.
547     */
548 
549     r = x * thirtytwo_by_logbaseof2;
550     /* Set n = nearest integer to r */
551     /* This is faster on Hammer */
552     if (r > 0)
553       n = (int)(r + 0.5);
554     else
555       n = (int)(r - 0.5);
556 
557     r1 = x - n * logbaseof2_by_32_lead;
558     r2 =   - n * logbaseof2_by_32_trail;
559 
560     /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
561     /* j = n % 32;
562        if (j < 0) j += 32; */
563     j = n & 0x0000001f;
564 
565     f1 = two_to_jby32_lead_table[j];
566     f2 = two_to_jby32_trail_table[j];
567 
568     *m = (n - j) / 32;
569 
570     /* Step 2. The following is the core approximation. We approximate
571        exp(r1+r2)-1 by a polynomial. */
572 
573     r1 *= logbase; r2 *= logbase;
574 
575     r = r1 + r2;
576     q = r1 + (r2 +
577               r*r*( 5.00000000000000008883e-01 +
578                       r*( 1.66666666665260878863e-01 +
579                       r*( 4.16666666662260795726e-02 +
580                       r*( 8.33336798434219616221e-03 +
581                       r*( 1.38889490863777199667e-03 ))))));
582 
583     /* Step 3. Function value reconstruction.
584        We now reconstruct the exponential of the input argument
585        so that exp(x) = 2**m * (z1 + z2).
586        The order of the computation below must be strictly observed. */
587 
588     *z1 = f1;
589     *z2 = f2 + ((f1 + f2) * q);
590 }
591 #endif /* USE_SPLITEXP */
592 
593 
594 #if defined(USE_SPLITEXPF)
595 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
596    Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
597    abs(x) > large/(ln(base)) (where large is the largest representable
598    floating point number) should be handled separately instead of calling
599    this function. This function is called by exp, exp2, exp10,
600    cosh and sinh. */
splitexpf(float x,float logbase,float thirtytwo_by_logbaseof2,float logbaseof2_by_32_lead,float logbaseof2_by_32_trail,int * m,float * z1,float * z2)601 static inline void splitexpf(float x, float logbase,
602                              float thirtytwo_by_logbaseof2,
603                              float logbaseof2_by_32_lead,
604                              float logbaseof2_by_32_trail,
605                              int *m, float *z1, float *z2)
606 {
607   float q, r, r1, r2, f1, f2;
608   int n, j;
609 
610 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
611    leading and trailing parts respectively of precomputed
612    values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
613    two_to_jby32_lead_table contains the first 10 bits of precision,
614    and two_to_jby32_trail_table contains a further 24 bits precision. */
615 
616   static const float two_to_jby32_lead_table[32] = {
617     1.0000000000E+00F,  /* 0x3F800000 */
618     1.0214843750E+00F,  /* 0x3F82C000 */
619     1.0429687500E+00F,  /* 0x3F858000 */
620     1.0664062500E+00F,  /* 0x3F888000 */
621     1.0898437500E+00F,  /* 0x3F8B8000 */
622     1.1132812500E+00F,  /* 0x3F8E8000 */
623     1.1386718750E+00F,  /* 0x3F91C000 */
624     1.1621093750E+00F,  /* 0x3F94C000 */
625     1.1875000000E+00F,  /* 0x3F980000 */
626     1.2148437500E+00F,  /* 0x3F9B8000 */
627     1.2402343750E+00F,  /* 0x3F9EC000 */
628     1.2675781250E+00F,  /* 0x3FA24000 */
629     1.2949218750E+00F,  /* 0x3FA5C000 */
630     1.3242187500E+00F,  /* 0x3FA98000 */
631     1.3535156250E+00F,  /* 0x3FAD4000 */
632     1.3828125000E+00F,  /* 0x3FB10000 */
633     1.4140625000E+00F,  /* 0x3FB50000 */
634     1.4433593750E+00F,  /* 0x3FB8C000 */
635     1.4765625000E+00F,  /* 0x3FBD0000 */
636     1.5078125000E+00F,  /* 0x3FC10000 */
637     1.5410156250E+00F,  /* 0x3FC54000 */
638     1.5742187500E+00F,  /* 0x3FC98000 */
639     1.6093750000E+00F,  /* 0x3FCE0000 */
640     1.6445312500E+00F,  /* 0x3FD28000 */
641     1.6816406250E+00F,  /* 0x3FD74000 */
642     1.7167968750E+00F,  /* 0x3FDBC000 */
643     1.7558593750E+00F,  /* 0x3FE0C000 */
644     1.7929687500E+00F,  /* 0x3FE58000 */
645     1.8339843750E+00F,  /* 0x3FEAC000 */
646     1.8730468750E+00F,  /* 0x3FEFC000 */
647     1.9140625000E+00F,  /* 0x3FF50000 */
648     1.9570312500E+00F}; /* 0x3FFA8000 */
649 
650   static const float two_to_jby32_trail_table[32] = {
651     0.0000000000E+00F,  /* 0x00000000 */
652     4.1277357377E-04F,  /* 0x39D86988 */
653     1.3050324051E-03F,  /* 0x3AAB0D9F */
654     7.3415064253E-04F,  /* 0x3A407404 */
655     6.6398258787E-04F,  /* 0x3A2E0F1E */
656     1.1054925853E-03F,  /* 0x3A90E62D */
657     1.1675967835E-04F,  /* 0x38F4DCE0 */
658     1.6154836630E-03F,  /* 0x3AD3BEA3 */
659     1.7071149778E-03F,  /* 0x3ADFC146 */
660     4.0360994171E-04F,  /* 0x39D39B9C */
661     1.6234370414E-03F,  /* 0x3AD4C982 */
662     1.4728321694E-03F,  /* 0x3AC10C0C */
663     1.9176795613E-03F,  /* 0x3AFB5AA6 */
664     1.0178930825E-03F,  /* 0x3A856AD3 */
665     7.3992193211E-04F,  /* 0x3A41F752 */
666     1.0973819299E-03F,  /* 0x3A8FD607 */
667     1.5106226783E-04F,  /* 0x391E6678 */
668     1.8214319134E-03F,  /* 0x3AEEBD1D */
669     2.6364589576E-04F,  /* 0x398A39F4 */
670     1.3519275235E-03F,  /* 0x3AB13329 */
671     1.1952003697E-03F,  /* 0x3A9CA845 */
672     1.7620950239E-03F,  /* 0x3AE6F619 */
673     1.1153318919E-03F,  /* 0x3A923054 */
674     1.2242280645E-03F,  /* 0x3AA07647 */
675     1.5220546629E-04F,  /* 0x391F9958 */
676     1.8224230735E-03F,  /* 0x3AEEDE5F */
677     3.9278529584E-04F,  /* 0x39CDEEC0 */
678     1.7403248930E-03F,  /* 0x3AE41B9D */
679     2.3711356334E-05F,  /* 0x37C6E7C0 */
680     1.1207590578E-03F,  /* 0x3A92E66F */
681     1.1440613307E-03F,  /* 0x3A95F454 */
682     1.1287408415E-04F}; /* 0x38ECB6D0 */
683 
684     /*
685       Step 1. Reduce the argument.
686 
687       To perform argument reduction, we find the integer n such that
688       x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
689       n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
690       remainder by x - n*logbaseof2/32. The calculation of n is
691       straightforward whereas the computation of x - n*logbaseof2/32
692       must be carried out carefully.
693       logbaseof2/32 is so represented in two pieces that
694       (1) logbaseof2/32 is known to extra precision, (2) the product
695       of n and the leading piece is a model number and is hence
696       calculated without error, and (3) the subtraction of the value
697       obtained in (2) from x is a model number and is hence again
698       obtained without error.
699     */
700 
701     r = x * thirtytwo_by_logbaseof2;
702     /* Set n = nearest integer to r */
703     /* This is faster on Hammer */
704     if (r > 0)
705       n = (int)(r + 0.5F);
706     else
707       n = (int)(r - 0.5F);
708 
709     r1 = x - n * logbaseof2_by_32_lead;
710     r2 =   - n * logbaseof2_by_32_trail;
711 
712     /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
713     /* j = n % 32;
714        if (j < 0) j += 32; */
715     j = n & 0x0000001f;
716 
717     f1 = two_to_jby32_lead_table[j];
718     f2 = two_to_jby32_trail_table[j];
719 
720     *m = (n - j) / 32;
721 
722     /* Step 2. The following is the core approximation. We approximate
723        exp(r1+r2)-1 by a polynomial. */
724 
725     r1 *= logbase; r2 *= logbase;
726 
727     r = r1 + r2;
728     q = r1 + (r2 +
729               r*r*( 5.00000000000000008883e-01F +
730                       r*( 1.66666666665260878863e-01F )));
731 
732     /* Step 3. Function value reconstruction.
733        We now reconstruct the exponential of the input argument
734        so that exp(x) = 2**m * (z1 + z2).
735        The order of the computation below must be strictly observed. */
736 
737     *z1 = f1;
738     *z2 = f2 + ((f1 + f2) * q);
739 }
740 #endif /* SPLITEXPF */
741 
742 
743 #if defined(USE_SCALEUPDOUBLE1024)
744 /* Scales up a double (normal or denormal) whose bit pattern is given
745    as ux by 2**1024. There are no checks that the input number is
746    scalable by that amount. */
scaleUpDouble1024(unsigned long long ux,unsigned long long * ur)747 static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
748 {
749   unsigned long long uy;
750   double y;
751 
752   if ((ux & EXPBITS_DP64) == 0)
753     {
754       /* ux is denormalised */
755       PUT_BITS_DP64(ux | 0x4010000000000000, y);
756       if (ux & SIGNBIT_DP64)
757         y += 4.0;
758       else
759         y -= 4.0;
760       GET_BITS_DP64(y, uy);
761     }
762   else
763     /* ux is normal */
764     uy = ux + 0x4000000000000000;
765 
766   *ur = uy;
767   return;
768 }
769 
770 #endif /* SCALEUPDOUBLE1024 */
771 
772 
773 #if defined(USE_SCALEDOWNDOUBLE)
774 /* Scales down a double whose bit pattern is given as ux by 2**k.
775    There are no checks that the input number is scalable by that amount. */
scaleDownDouble(unsigned long long ux,int k,unsigned long long * ur)776 static inline void scaleDownDouble(unsigned long long ux, int k,
777                                    unsigned long long *ur)
778 {
779   unsigned long long uy, uk, ax, xsign;
780   int n, shift;
781   xsign = ux & SIGNBIT_DP64;
782   ax = ux & ~SIGNBIT_DP64;
783   n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
784   if (n > 0)
785     {
786       uk = (unsigned long long)n << EXPSHIFTBITS_DP64;
787       uy = (ax & ~EXPBITS_DP64) | uk;
788     }
789   else
790     {
791       uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000;
792       shift = (1 - n);
793       if (shift > MANTLENGTH_DP64 + 1)
794         /* Sigh. Shifting works mod 64 so be careful not to shift too much */
795         uy = 0;
796       else
797         {
798           /* Make sure we round the result */
799           uy >>= shift - 1;
800           uy = (uy >> 1) + (uy & 1);
801         }
802     }
803   *ur = uy | xsign;
804 }
805 
806 #endif /* SCALEDOWNDOUBLE */
807 
808 
809 #if defined(USE_SCALEUPFLOAT128)
810 /* Scales up a float (normal or denormal) whose bit pattern is given
811    as ux by 2**128. There are no checks that the input number is
812    scalable by that amount. */
scaleUpFloat128(unsigned int ux,unsigned int * ur)813 static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur)
814 {
815   unsigned int uy;
816   float y;
817 
818   if ((ux & EXPBITS_SP32) == 0)
819     {
820       /* ux is denormalised */
821       PUT_BITS_SP32(ux | 0x40800000, y);
822       /* Compensate for the implicit bit just added */
823       if (ux & SIGNBIT_SP32)
824         y += 4.0F;
825       else
826         y -= 4.0F;
827       GET_BITS_SP32(y, uy);
828     }
829   else
830     /* ux is normal */
831     uy = ux + 0x40000000;
832   *ur = uy;
833 }
834 #endif /* SCALEUPFLOAT128 */
835 
836 
837 #if defined(USE_SCALEDOWNFLOAT)
838 /* Scales down a float whose bit pattern is given as ux by 2**k.
839    There are no checks that the input number is scalable by that amount. */
scaleDownFloat(unsigned int ux,int k,unsigned int * ur)840 static inline void scaleDownFloat(unsigned int ux, int k,
841                                   unsigned int *ur)
842 {
843   unsigned int uy, uk, ax, xsign;
844   int n, shift;
845 
846   xsign = ux & SIGNBIT_SP32;
847   ax = ux & ~SIGNBIT_SP32;
848   n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k;
849   if (n > 0)
850     {
851       uk = (unsigned int)n << EXPSHIFTBITS_SP32;
852       uy = (ax & ~EXPBITS_SP32) | uk;
853     }
854   else
855     {
856       uy = (ax & ~EXPBITS_SP32) | 0x00800000;
857       shift = (1 - n);
858       if (shift > MANTLENGTH_SP32 + 1)
859         /* Sigh. Shifting works mod 32 so be careful not to shift too much */
860         uy = 0;
861       else
862         {
863           /* Make sure we round the result */
864           uy >>= shift - 1;
865           uy = (uy >> 1) + (uy & 1);
866         }
867     }
868   *ur = uy | xsign;
869 }
870 #endif /* SCALEDOWNFLOAT */
871 
872 
873 #if defined(USE_SQRT_AMD_INLINE)
sqrt_amd_inline(double x)874 static inline double sqrt_amd_inline(double x)
875 {
876   /*
877      Computes the square root of x.
878 
879      The calculation is carried out in three steps.
880 
881      Step 1. Reduction.
882      The input argument is scaled to the interval [1, 4) by
883      computing
884                x = 2^e * y, where y in [1,4).
885      Furthermore y is decomposed as y = c + t where
886                c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
887 
888      Step 2. Approximation.
889      An approximation q = sqrt(1 + (t/c)) - 1  is obtained
890      from a basic series expansion using precomputed values
891      stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl.
892 
893      Step 3. Reconstruction.
894      The value of sqrt(x) is reconstructed via
895        sqrt(x) = 2^(e/2) * sqrt(y)
896                = 2^(e/2) * sqrt(c) * sqrt(y/c)
897                = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
898                = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
899     */
900 
901   unsigned long long ux, ax, u;
902   double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
903   int e, denorm = 0, index;
904 
905 /* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain
906    leading and trailing parts respectively of precomputed
907    values of sqrt(j/32), for j = 32, 33, ..., 128.
908    rt_jby32_lead_table_dbl contains the first 21 bits of precision,
909    and rt_jby32_trail_table_dbl contains a further 53 bits precision. */
910 
911   static const double rt_jby32_lead_table_dbl[97] = {
912     1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
913     1.01550388336181640625e+00,   /* 0x3ff03f8100000000 */
914     1.03077602386474609375e+00,   /* 0x3ff07e0f00000000 */
915     1.04582500457763671875e+00,   /* 0x3ff0bbb300000000 */
916     1.06065940856933593750e+00,   /* 0x3ff0f87600000000 */
917     1.07528972625732421875e+00,   /* 0x3ff1346300000000 */
918     1.08972454071044921875e+00,   /* 0x3ff16f8300000000 */
919     1.10396957397460937500e+00,   /* 0x3ff1a9dc00000000 */
920     1.11803340911865234375e+00,   /* 0x3ff1e37700000000 */
921     1.13192272186279296875e+00,   /* 0x3ff21c5b00000000 */
922     1.14564323425292968750e+00,   /* 0x3ff2548e00000000 */
923     1.15920162200927734375e+00,   /* 0x3ff28c1700000000 */
924     1.17260360717773437500e+00,   /* 0x3ff2c2fc00000000 */
925     1.18585395812988281250e+00,   /* 0x3ff2f94200000000 */
926     1.19895744323730468750e+00,   /* 0x3ff32eee00000000 */
927     1.21191978454589843750e+00,   /* 0x3ff3640600000000 */
928     1.22474479675292968750e+00,   /* 0x3ff3988e00000000 */
929     1.23743629455566406250e+00,   /* 0x3ff3cc8a00000000 */
930     1.25000000000000000000e+00,   /* 0x3ff4000000000000 */
931     1.26243782043457031250e+00,   /* 0x3ff432f200000000 */
932     1.27475452423095703125e+00,   /* 0x3ff4656500000000 */
933     1.28695297241210937500e+00,   /* 0x3ff4975c00000000 */
934     1.29903793334960937500e+00,   /* 0x3ff4c8dc00000000 */
935     1.31101036071777343750e+00,   /* 0x3ff4f9e600000000 */
936     1.32287502288818359375e+00,   /* 0x3ff52a7f00000000 */
937     1.33463478088378906250e+00,   /* 0x3ff55aaa00000000 */
938     1.34629058837890625000e+00,   /* 0x3ff58a6800000000 */
939     1.35784721374511718750e+00,   /* 0x3ff5b9be00000000 */
940     1.36930561065673828125e+00,   /* 0x3ff5e8ad00000000 */
941     1.38066959381103515625e+00,   /* 0x3ff6173900000000 */
942     1.39194107055664062500e+00,   /* 0x3ff6456400000000 */
943     1.40312099456787109375e+00,   /* 0x3ff6732f00000000 */
944     1.41421318054199218750e+00,   /* 0x3ff6a09e00000000 */
945     1.42521858215332031250e+00,   /* 0x3ff6cdb200000000 */
946     1.43614006042480468750e+00,   /* 0x3ff6fa6e00000000 */
947     1.44697952270507812500e+00,   /* 0x3ff726d400000000 */
948     1.45773792266845703125e+00,   /* 0x3ff752e500000000 */
949     1.46841716766357421875e+00,   /* 0x3ff77ea300000000 */
950     1.47901916503906250000e+00,   /* 0x3ff7aa1000000000 */
951     1.48954677581787109375e+00,   /* 0x3ff7d52f00000000 */
952     1.50000000000000000000e+00,   /* 0x3ff8000000000000 */
953     1.51038074493408203125e+00,   /* 0x3ff82a8500000000 */
954     1.52068996429443359375e+00,   /* 0x3ff854bf00000000 */
955     1.53093051910400390625e+00,   /* 0x3ff87eb100000000 */
956     1.54110336303710937500e+00,   /* 0x3ff8a85c00000000 */
957     1.55120849609375000000e+00,   /* 0x3ff8d1c000000000 */
958     1.56124877929687500000e+00,   /* 0x3ff8fae000000000 */
959     1.57122516632080078125e+00,   /* 0x3ff923bd00000000 */
960     1.58113861083984375000e+00,   /* 0x3ff94c5800000000 */
961     1.59099006652832031250e+00,   /* 0x3ff974b200000000 */
962     1.60078048706054687500e+00,   /* 0x3ff99ccc00000000 */
963     1.61051177978515625000e+00,   /* 0x3ff9c4a800000000 */
964     1.62018489837646484375e+00,   /* 0x3ff9ec4700000000 */
965     1.62979984283447265625e+00,   /* 0x3ffa13a900000000 */
966     1.63935947418212890625e+00,   /* 0x3ffa3ad100000000 */
967     1.64886283874511718750e+00,   /* 0x3ffa61be00000000 */
968     1.65831184387207031250e+00,   /* 0x3ffa887200000000 */
969     1.66770744323730468750e+00,   /* 0x3ffaaeee00000000 */
970     1.67705059051513671875e+00,   /* 0x3ffad53300000000 */
971     1.68634128570556640625e+00,   /* 0x3ffafb4100000000 */
972     1.69558238983154296875e+00,   /* 0x3ffb211b00000000 */
973     1.70477199554443359375e+00,   /* 0x3ffb46bf00000000 */
974     1.71391296386718750000e+00,   /* 0x3ffb6c3000000000 */
975     1.72300529479980468750e+00,   /* 0x3ffb916e00000000 */
976     1.73204994201660156250e+00,   /* 0x3ffbb67a00000000 */
977     1.74104785919189453125e+00,   /* 0x3ffbdb5500000000 */
978     1.75000000000000000000e+00,   /* 0x3ffc000000000000 */
979     1.75890541076660156250e+00,   /* 0x3ffc247a00000000 */
980     1.76776695251464843750e+00,   /* 0x3ffc48c600000000 */
981     1.77658367156982421875e+00,   /* 0x3ffc6ce300000000 */
982     1.78535652160644531250e+00,   /* 0x3ffc90d200000000 */
983     1.79408740997314453125e+00,   /* 0x3ffcb49500000000 */
984     1.80277538299560546875e+00,   /* 0x3ffcd82b00000000 */
985     1.81142139434814453125e+00,   /* 0x3ffcfb9500000000 */
986     1.82002735137939453125e+00,   /* 0x3ffd1ed500000000 */
987     1.82859230041503906250e+00,   /* 0x3ffd41ea00000000 */
988     1.83711719512939453125e+00,   /* 0x3ffd64d500000000 */
989     1.84560203552246093750e+00,   /* 0x3ffd879600000000 */
990     1.85404872894287109375e+00,   /* 0x3ffdaa2f00000000 */
991     1.86245727539062500000e+00,   /* 0x3ffdcca000000000 */
992     1.87082862854003906250e+00,   /* 0x3ffdeeea00000000 */
993     1.87916183471679687500e+00,   /* 0x3ffe110c00000000 */
994     1.88745784759521484375e+00,   /* 0x3ffe330700000000 */
995     1.89571857452392578125e+00,   /* 0x3ffe54dd00000000 */
996     1.90394306182861328125e+00,   /* 0x3ffe768d00000000 */
997     1.91213226318359375000e+00,   /* 0x3ffe981800000000 */
998     1.92028617858886718750e+00,   /* 0x3ffeb97e00000000 */
999     1.92840576171875000000e+00,   /* 0x3ffedac000000000 */
1000     1.93649101257324218750e+00,   /* 0x3ffefbde00000000 */
1001     1.94454288482666015625e+00,   /* 0x3fff1cd900000000 */
1002     1.95256233215332031250e+00,   /* 0x3fff3db200000000 */
1003     1.96054744720458984375e+00,   /* 0x3fff5e6700000000 */
1004     1.96850109100341796875e+00,   /* 0x3fff7efb00000000 */
1005     1.97642326354980468750e+00,   /* 0x3fff9f6e00000000 */
1006     1.98431301116943359375e+00,   /* 0x3fffbfbf00000000 */
1007     1.99217128753662109375e+00,   /* 0x3fffdfef00000000 */
1008     2.00000000000000000000e+00};  /* 0x4000000000000000 */
1009 
1010   static const double rt_jby32_trail_table_dbl[97] = {
1011     0.00000000000000000000e+00,   /* 0x0000000000000000 */
1012     9.17217678638807524014e-07,   /* 0x3eaec6d70177881c */
1013     3.82539669043705364790e-07,   /* 0x3e99abfb41bd6b24 */
1014     2.85899577162227138140e-08,   /* 0x3e5eb2bf6bab55a2 */
1015     7.63210485349101216659e-07,   /* 0x3ea99bed9b2d8d0c */
1016     9.32123004127716212874e-07,   /* 0x3eaf46e029c1b296 */
1017     1.95174719169309219157e-07,   /* 0x3e8a3226fc42f30c */
1018     5.34316371481845492427e-07,   /* 0x3ea1edbe20701d73 */
1019     5.79631242504454563052e-07,   /* 0x3ea372fe94f82be7 */
1020     4.20404384109571705948e-07,   /* 0x3e9c367e08e7bb06 */
1021     6.89486030314147010716e-07,   /* 0x3ea722a3d0a66608 */
1022     6.89927685625314560328e-07,   /* 0x3ea7266f067ca1d6 */
1023     3.32778123013641425828e-07,   /* 0x3e965515a9b34850 */
1024     1.64433259436999584387e-07,   /* 0x3e8611e23ef6c1bd */
1025     4.37590875197899335723e-07,   /* 0x3e9d5dc1059ed8e7 */
1026     1.79808183816018617413e-07,   /* 0x3e88222982d0e4f4 */
1027     7.46386593615986477624e-08,   /* 0x3e7409212e7d0322 */
1028     5.72520794105201454728e-07,   /* 0x3ea335ea8a5fcf39 */
1029     0.00000000000000000000e+00,   /* 0x0000000000000000 */
1030     2.96860689431670420344e-07,   /* 0x3e93ec071e938bfe */
1031     3.54167239176257065345e-07,   /* 0x3e97c48bfd9862c6 */
1032     7.95211265664474710063e-07,   /* 0x3eaaaed010f74671 */
1033     1.72327048595145565621e-07,   /* 0x3e87211cbfeb62e0 */
1034     6.99494915996239297020e-07,   /* 0x3ea7789d9660e72d */
1035     6.32644111701500844315e-07,   /* 0x3ea53a5f1d36f1cf */
1036     6.20124838851440463844e-10,   /* 0x3e054eacff2057dc */
1037     6.13404719757812629969e-07,   /* 0x3ea4951b3e6a83cc */
1038     3.47654909777986407387e-07,   /* 0x3e9754aa76884c66 */
1039     7.83106177002392475763e-07,   /* 0x3eaa46d4b1de1074 */
1040     5.33337372440526357008e-07,   /* 0x3ea1e55548f92635 */
1041     2.01508648555298681765e-08,   /* 0x3e55a3070dd17788 */
1042     5.25472356925843939587e-07,   /* 0x3ea1a1c5eedb0801 */
1043     3.81831102861301692797e-07,   /* 0x3e999fcef32422cc */
1044     6.99220602161420018738e-07,   /* 0x3ea776425d6b0199 */
1045     6.01209702477462624811e-07,   /* 0x3ea42c5a1e0191a2 */
1046     9.01437000591944740554e-08,   /* 0x3e7832a0bdff1327 */
1047     5.10428680864685379950e-08,   /* 0x3e6b674743636676 */
1048     3.47895267104621031421e-07,   /* 0x3e9758cb90d2f714 */
1049     7.80735841510641848628e-07,   /* 0x3eaa3278459cde25 */
1050     1.35158752025506517690e-07,   /* 0x3e822404f4a103ee */
1051     0.00000000000000000000e+00,   /* 0x0000000000000000 */
1052     1.76523947728535489812e-09,   /* 0x3e1e539af6892ac5 */
1053     6.68280121328499932183e-07,   /* 0x3ea66c7b872c9cd0 */
1054     5.70135482405123276616e-07,   /* 0x3ea3216d2f43887d */
1055     1.37705134737562525897e-07,   /* 0x3e827b832cbedc0e */
1056     7.09655107074516613672e-07,   /* 0x3ea7cfe41579091d */
1057     7.20302724551461693011e-07,   /* 0x3ea82b5a713c490a */
1058     4.69926266058212796694e-07,   /* 0x3e9f8945932d872e */
1059     2.19244345915999437026e-07,   /* 0x3e8d6d2da9490251 */
1060     1.91141411617401877927e-07,   /* 0x3e89a791a3114e4a */
1061     5.72297665296622053774e-07,   /* 0x3ea333ffe005988d */
1062     5.61055484436830560103e-07,   /* 0x3ea2d36e0ed49ab1 */
1063     2.76225500213991506100e-07,   /* 0x3e92898498f55f9e */
1064     7.58466189522395692908e-07,   /* 0x3ea9732cca1032a3 */
1065     1.56893371256836029827e-07,   /* 0x3e850ed0b02a22d2 */
1066     4.06038997708867066507e-07,   /* 0x3e9b3fb265b1e40a */
1067     5.51305629612057435809e-07,   /* 0x3ea27fade682d1de */
1068     5.64778487026561123207e-07,   /* 0x3ea2f36906f707ba */
1069     3.92609705553556897517e-07,   /* 0x3e9a58fbbee883b6 */
1070     9.09698438776943827802e-07,   /* 0x3eae864005bca6d7 */
1071     1.05949774066016139743e-07,   /* 0x3e7c70d02300f263 */
1072     7.16578798392844784244e-07,   /* 0x3ea80b5d712d8e3e */
1073     6.86233073531233972561e-07,   /* 0x3ea706b27cc7d390 */
1074     7.99211473033494452908e-07,   /* 0x3eaad12c9d849a97 */
1075     8.65552275731027456121e-07,   /* 0x3ead0b09954e764b */
1076     6.75456120386058448618e-07,   /* 0x3ea6aa1fb7826cbd */
1077     0.00000000000000000000e+00,   /* 0x0000000000000000 */
1078     4.99167184520462138743e-07,   /* 0x3ea0bfd03f46763c */
1079     4.51720373502110930296e-10,   /* 0x3dff0abfb4adfb9e */
1080     1.28874162718371367439e-07,   /* 0x3e814c151f991b2e */
1081     5.85529267186999798656e-07,   /* 0x3ea3a5a879b09292 */
1082     1.01827770937125531924e-07,   /* 0x3e7b558d173f9796 */
1083     2.54736389177809626508e-07,   /* 0x3e9118567cd83fb8 */
1084     6.98925535290464831294e-07,   /* 0x3ea773b981896751 */
1085     1.20940735036524314513e-07,   /* 0x3e803b7df49f48a8 */
1086     5.43759351196479689657e-08,   /* 0x3e6d315f22491900 */
1087     1.11957989042397958409e-07,   /* 0x3e7e0db1c5bb84b2 */
1088     8.47006714134442661218e-07,   /* 0x3eac6bbb7644ff76 */
1089     8.92831044643427836228e-07,   /* 0x3eadf55c3afec01f */
1090     7.77828292464916501663e-07,   /* 0x3eaa197e81034da3 */
1091     6.48469316302918797451e-08,   /* 0x3e71683f4920555d */
1092     2.12579816658859849140e-07,   /* 0x3e8c882fd78bb0b0 */
1093     7.61222472580559138435e-07,   /* 0x3ea98ad9eb7b83ec */
1094     2.86488961857314189607e-07,   /* 0x3e9339d7c7777273 */
1095     2.14637363790165363515e-07,   /* 0x3e8ccee237cae6fe */
1096     5.44137005612605847831e-08,   /* 0x3e6d368fe324a146 */
1097     2.58378284856442408413e-07,   /* 0x3e9156e7b6d99b45 */
1098     3.15848939061134843091e-07,   /* 0x3e95323e5310b5c1 */
1099     6.60530466255089632309e-07,   /* 0x3ea629e9db362f5d */
1100     7.63436345535852301127e-07,   /* 0x3ea99dde4728d7ec */
1101     8.68233432860324345268e-08,   /* 0x3e774e746878544d */
1102     9.45465175398023087082e-07,   /* 0x3eafb97be873a87d */
1103     8.77499534786171267246e-07,   /* 0x3ead71a9e23c2f63 */
1104     2.74055432394999316135e-07,   /* 0x3e92643c89cda173 */
1105     4.72129009349126213532e-07,   /* 0x3e9faf1d57a4d56c */
1106     8.93777032327078947306e-07,   /* 0x3eadfd7c7ab7b282 */
1107     0.00000000000000000000e+00};  /* 0x0000000000000000 */
1108 
1109 
1110   /* Handle special arguments first */
1111 
1112   GET_BITS_DP64(x, ux);
1113   ax = ux & (~SIGNBIT_DP64);
1114 
1115   if(ax >= 0x7ff0000000000000)
1116     {
1117       /* x is either NaN or infinity */
1118       if (ux & MANTBITS_DP64)
1119         /* x is NaN */
1120         return x + x; /* Raise invalid if it is a signalling NaN */
1121       else if (ux & SIGNBIT_DP64)
1122         /* x is negative infinity */
1123         return nan_with_flags(AMD_F_INVALID);
1124       else
1125         /* x is positive infinity */
1126         return x;
1127     }
1128   else if (ux & SIGNBIT_DP64)
1129     {
1130       /* x is negative. */
1131       if (ux == SIGNBIT_DP64)
1132         /* Handle negative zero first */
1133         return x;
1134       else
1135         return nan_with_flags(AMD_F_INVALID);
1136     }
1137   else if (ux <= 0x000fffffffffffff)
1138     {
1139       /* x is denormalised or zero */
1140       if (ux == 0)
1141         /* x is zero */
1142         return x;
1143       else
1144         {
1145           /* x is denormalised; scale it up */
1146           /* Normalize x by increasing the exponent by 60
1147              and subtracting a correction to account for the implicit
1148              bit. This replaces a slow denormalized
1149              multiplication by a fast normal subtraction. */
1150           static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
1151           denorm = 1;
1152           GET_BITS_DP64(x, ux);
1153           PUT_BITS_DP64(ux | 0x03d0000000000000, x);
1154           x -= corr;
1155           GET_BITS_DP64(x, ux);
1156         }
1157     }
1158 
1159   /* Main algorithm */
1160 
1161   /*
1162      Find y and e such that x = 2^e * y, where y in [1,4).
1163      This is done using an in-lined variant of splitDouble,
1164      which also ensures that e is even.
1165    */
1166   y = x;
1167   ux &= EXPBITS_DP64;
1168   ux >>= EXPSHIFTBITS_DP64;
1169   if (ux & 1)
1170     {
1171       GET_BITS_DP64(y, u);
1172       u &= (SIGNBIT_DP64 | MANTBITS_DP64);
1173       u |= ONEEXPBITS_DP64;
1174       PUT_BITS_DP64(u, y);
1175       e = ux - EXPBIAS_DP64;
1176     }
1177   else
1178     {
1179       GET_BITS_DP64(y, u);
1180       u &= (SIGNBIT_DP64 | MANTBITS_DP64);
1181       u |= TWOEXPBITS_DP64;
1182       PUT_BITS_DP64(u, y);
1183       e = ux - EXPBIAS_DP64 - 1;
1184     }
1185 
1186 
1187   /* Find the index of the sub-interval of [1,4) in which y lies. */
1188 
1189   index = (int)(32.0*y+0.5);
1190 
1191   /* Look up the table values and compute c and r = c/t */
1192 
1193   rtc_lead = rt_jby32_lead_table_dbl[index-32];
1194   rtc_trail = rt_jby32_trail_table_dbl[index-32];
1195   c = 0.03125*index;
1196   r = (y - c)/c;
1197 
1198   /*
1199     Find q = sqrt(1+r) - 1.
1200     From one step of Newton on (q+1)^2 = 1+r
1201   */
1202 
1203   p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01));
1204   twop = p + p;
1205   q = p - (p*p + (twop - r))/(twop + 2.0);
1206 
1207   /* Reconstruction */
1208 
1209   rtc = rtc_lead + rtc_trail;
1210   e >>= 1; /* e = e/2 */
1211   z = rtc_lead + (rtc*q+rtc_trail);
1212 
1213   if (denorm)
1214     {
1215       /* Scale by 2**(e-30) */
1216       PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
1217       z *= r;
1218     }
1219   else
1220     {
1221       /* Scale by 2**e */
1222       PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
1223       z *= r;
1224     }
1225 
1226   return z;
1227 
1228 }
1229 #endif /* SQRT_AMD_INLINE */
1230 
1231 #if defined(USE_SQRTF_AMD_INLINE)
1232 
sqrtf_amd_inline(float x)1233 static inline float sqrtf_amd_inline(float x)
1234 {
1235   /*
1236      Computes the square root of x.
1237 
1238      The calculation is carried out in three steps.
1239 
1240      Step 1. Reduction.
1241      The input argument is scaled to the interval [1, 4) by
1242      computing
1243                x = 2^e * y, where y in [1,4).
1244      Furthermore y is decomposed as y = c + t where
1245                c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
1246 
1247      Step 2. Approximation.
1248      An approximation q = sqrt(1 + (t/c)) - 1  is obtained
1249      from a basic series expansion using precomputed values
1250      stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float.
1251 
1252      Step 3. Reconstruction.
1253      The value of sqrt(x) is reconstructed via
1254        sqrt(x) = 2^(e/2) * sqrt(y)
1255                = 2^(e/2) * sqrt(c) * sqrt(y/c)
1256                = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
1257                = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
1258     */
1259 
1260   unsigned int ux, ax, u;
1261   float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
1262   int e, denorm = 0, index;
1263 
1264 /* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain
1265    leading and trailing parts respectively of precomputed
1266    values of sqrt(j/32), for j = 32, 33, ..., 128.
1267    rt_jby32_lead_table_float contains the first 13 bits of precision,
1268    and rt_jby32_trail_table_float contains a further 24 bits precision. */
1269 
1270 static const float rt_jby32_lead_table_float[97] = {
1271     1.00000000000000000000e+00F,   /* 0x3f800000 */
1272     1.01538085937500000000e+00F,   /* 0x3f81f800 */
1273     1.03076171875000000000e+00F,   /* 0x3f83f000 */
1274     1.04565429687500000000e+00F,   /* 0x3f85d800 */
1275     1.06054687500000000000e+00F,   /* 0x3f87c000 */
1276     1.07519531250000000000e+00F,   /* 0x3f89a000 */
1277     1.08959960937500000000e+00F,   /* 0x3f8b7800 */
1278     1.10375976562500000000e+00F,   /* 0x3f8d4800 */
1279     1.11791992187500000000e+00F,   /* 0x3f8f1800 */
1280     1.13183593750000000000e+00F,   /* 0x3f90e000 */
1281     1.14550781250000000000e+00F,   /* 0x3f92a000 */
1282     1.15917968750000000000e+00F,   /* 0x3f946000 */
1283     1.17236328125000000000e+00F,   /* 0x3f961000 */
1284     1.18579101562500000000e+00F,   /* 0x3f97c800 */
1285     1.19873046875000000000e+00F,   /* 0x3f997000 */
1286     1.21191406250000000000e+00F,   /* 0x3f9b2000 */
1287     1.22460937500000000000e+00F,   /* 0x3f9cc000 */
1288     1.23730468750000000000e+00F,   /* 0x3f9e6000 */
1289     1.25000000000000000000e+00F,   /* 0x3fa00000 */
1290     1.26220703125000000000e+00F,   /* 0x3fa19000 */
1291     1.27465820312500000000e+00F,   /* 0x3fa32800 */
1292     1.28686523437500000000e+00F,   /* 0x3fa4b800 */
1293     1.29882812500000000000e+00F,   /* 0x3fa64000 */
1294     1.31079101562500000000e+00F,   /* 0x3fa7c800 */
1295     1.32275390625000000000e+00F,   /* 0x3fa95000 */
1296     1.33447265625000000000e+00F,   /* 0x3faad000 */
1297     1.34619140625000000000e+00F,   /* 0x3fac5000 */
1298     1.35766601562500000000e+00F,   /* 0x3fadc800 */
1299     1.36914062500000000000e+00F,   /* 0x3faf4000 */
1300     1.38061523437500000000e+00F,   /* 0x3fb0b800 */
1301     1.39184570312500000000e+00F,   /* 0x3fb22800 */
1302     1.40307617187500000000e+00F,   /* 0x3fb39800 */
1303     1.41406250000000000000e+00F,   /* 0x3fb50000 */
1304     1.42504882812500000000e+00F,   /* 0x3fb66800 */
1305     1.43603515625000000000e+00F,   /* 0x3fb7d000 */
1306     1.44677734375000000000e+00F,   /* 0x3fb93000 */
1307     1.45751953125000000000e+00F,   /* 0x3fba9000 */
1308     1.46826171875000000000e+00F,   /* 0x3fbbf000 */
1309     1.47900390625000000000e+00F,   /* 0x3fbd5000 */
1310     1.48950195312500000000e+00F,   /* 0x3fbea800 */
1311     1.50000000000000000000e+00F,   /* 0x3fc00000 */
1312     1.51025390625000000000e+00F,   /* 0x3fc15000 */
1313     1.52050781250000000000e+00F,   /* 0x3fc2a000 */
1314     1.53076171875000000000e+00F,   /* 0x3fc3f000 */
1315     1.54101562500000000000e+00F,   /* 0x3fc54000 */
1316     1.55102539062500000000e+00F,   /* 0x3fc68800 */
1317     1.56103515625000000000e+00F,   /* 0x3fc7d000 */
1318     1.57104492187500000000e+00F,   /* 0x3fc91800 */
1319     1.58105468750000000000e+00F,   /* 0x3fca6000 */
1320     1.59082031250000000000e+00F,   /* 0x3fcba000 */
1321     1.60058593750000000000e+00F,   /* 0x3fcce000 */
1322     1.61035156250000000000e+00F,   /* 0x3fce2000 */
1323     1.62011718750000000000e+00F,   /* 0x3fcf6000 */
1324     1.62963867187500000000e+00F,   /* 0x3fd09800 */
1325     1.63916015625000000000e+00F,   /* 0x3fd1d000 */
1326     1.64868164062500000000e+00F,   /* 0x3fd30800 */
1327     1.65820312500000000000e+00F,   /* 0x3fd44000 */
1328     1.66748046875000000000e+00F,   /* 0x3fd57000 */
1329     1.67700195312500000000e+00F,   /* 0x3fd6a800 */
1330     1.68627929687500000000e+00F,   /* 0x3fd7d800 */
1331     1.69555664062500000000e+00F,   /* 0x3fd90800 */
1332     1.70458984375000000000e+00F,   /* 0x3fda3000 */
1333     1.71386718750000000000e+00F,   /* 0x3fdb6000 */
1334     1.72290039062500000000e+00F,   /* 0x3fdc8800 */
1335     1.73193359375000000000e+00F,   /* 0x3fddb000 */
1336     1.74096679687500000000e+00F,   /* 0x3fded800 */
1337     1.75000000000000000000e+00F,   /* 0x3fe00000 */
1338     1.75878906250000000000e+00F,   /* 0x3fe12000 */
1339     1.76757812500000000000e+00F,   /* 0x3fe24000 */
1340     1.77636718750000000000e+00F,   /* 0x3fe36000 */
1341     1.78515625000000000000e+00F,   /* 0x3fe48000 */
1342     1.79394531250000000000e+00F,   /* 0x3fe5a000 */
1343     1.80273437500000000000e+00F,   /* 0x3fe6c000 */
1344     1.81127929687500000000e+00F,   /* 0x3fe7d800 */
1345     1.81982421875000000000e+00F,   /* 0x3fe8f000 */
1346     1.82836914062500000000e+00F,   /* 0x3fea0800 */
1347     1.83691406250000000000e+00F,   /* 0x3feb2000 */
1348     1.84545898437500000000e+00F,   /* 0x3fec3800 */
1349     1.85400390625000000000e+00F,   /* 0x3fed5000 */
1350     1.86230468750000000000e+00F,   /* 0x3fee6000 */
1351     1.87060546875000000000e+00F,   /* 0x3fef7000 */
1352     1.87915039062500000000e+00F,   /* 0x3ff08800 */
1353     1.88745117187500000000e+00F,   /* 0x3ff19800 */
1354     1.89550781250000000000e+00F,   /* 0x3ff2a000 */
1355     1.90380859375000000000e+00F,   /* 0x3ff3b000 */
1356     1.91210937500000000000e+00F,   /* 0x3ff4c000 */
1357     1.92016601562500000000e+00F,   /* 0x3ff5c800 */
1358     1.92822265625000000000e+00F,   /* 0x3ff6d000 */
1359     1.93627929687500000000e+00F,   /* 0x3ff7d800 */
1360     1.94433593750000000000e+00F,   /* 0x3ff8e000 */
1361     1.95239257812500000000e+00F,   /* 0x3ff9e800 */
1362     1.96044921875000000000e+00F,   /* 0x3ffaf000 */
1363     1.96826171875000000000e+00F,   /* 0x3ffbf000 */
1364     1.97631835937500000000e+00F,   /* 0x3ffcf800 */
1365     1.98413085937500000000e+00F,   /* 0x3ffdf800 */
1366     1.99194335937500000000e+00F,   /* 0x3ffef800 */
1367     2.00000000000000000000e+00F};  /* 0x40000000 */
1368 
1369 static const float rt_jby32_trail_table_float[97] = {
1370     0.00000000000000000000e+00F,   /* 0x00000000 */
1371     1.23941208585165441036e-04F,   /* 0x3901f637 */
1372     1.46876545841223560274e-05F,   /* 0x37766aff */
1373     1.70736297150142490864e-04F,   /* 0x393307ad */
1374     1.13296780909877270460e-04F,   /* 0x38ed99bf */
1375     9.53458802541717886925e-05F,   /* 0x38c7f46e */
1376     1.25126505736261606216e-04F,   /* 0x39033464 */
1377     2.10342666832730174065e-04F,   /* 0x395c8f6e */
1378     1.14066875539720058441e-04F,   /* 0x38ef3730 */
1379     8.72047676239162683487e-05F,   /* 0x38b6e1b4 */
1380     1.36111237225122749805e-04F,   /* 0x390eb915 */
1381     2.26244374061934649944e-05F,   /* 0x37bdc99c */
1382     2.40658700931817293167e-04F,   /* 0x397c5954 */
1383     6.31069415248930454254e-05F,   /* 0x38845848 */
1384     2.27412077947519719601e-04F,   /* 0x396e7577 */
1385     5.90185391047270968556e-06F,   /* 0x36c6088a */
1386     1.35496389702893793583e-04F,   /* 0x390e1409 */
1387     1.32179571664892137051e-04F,   /* 0x390a99af */
1388     0.00000000000000000000e+00F,   /* 0x00000000 */
1389     2.31086043640971183777e-04F,   /* 0x39724fb0 */
1390     9.66752704698592424393e-05F,   /* 0x38cabe24 */
1391     8.85332483449019491673e-05F,   /* 0x38b9aaed */
1392     2.09980673389509320259e-04F,   /* 0x395c2e42 */
1393     2.20044588786549866199e-04F,   /* 0x3966bbc5 */
1394     1.21749282698146998882e-04F,   /* 0x38ff53a6 */
1395     1.62125259521417319775e-04F,   /* 0x392a002b */
1396     9.97955357888713479042e-05F,   /* 0x38d14952 */
1397     1.81545779923908412457e-04F,   /* 0x393e5d53 */
1398     1.65768768056295812130e-04F,   /* 0x392dd237 */
1399     5.48927710042335093021e-05F,   /* 0x38663caa */
1400     9.53875860432162880898e-05F,   /* 0x38c80ad2 */
1401     4.53481625299900770187e-05F,   /* 0x383e3438 */
1402     1.51062369695864617825e-04F,   /* 0x391e667f */
1403     1.70453247847035527229e-04F,   /* 0x3932bbb2 */
1404     1.05505387182347476482e-04F,   /* 0x38dd42c6 */
1405     2.02269104192964732647e-04F,   /* 0x39541833 */
1406     2.18442466575652360916e-04F,   /* 0x39650db4 */
1407     1.55796806211583316326e-04F,   /* 0x39235d63 */
1408     1.60395247803535312414e-05F,   /* 0x37868c9e */
1409     4.49578510597348213196e-05F,   /* 0x383c9120 */
1410     0.00000000000000000000e+00F,   /* 0x00000000 */
1411     1.26840444863773882389e-04F,   /* 0x39050079 */
1412     1.82820076588541269302e-04F,   /* 0x393fb364 */
1413     1.69370483490638434887e-04F,   /* 0x3931990b */
1414     8.78757418831810355186e-05F,   /* 0x38b849ee */
1415     1.83815121999941766262e-04F,   /* 0x3940be7f */
1416     2.14343352126888930798e-04F,   /* 0x3960c15b */
1417     1.80714370799250900745e-04F,   /* 0x393d7e25 */
1418     8.41425862745381891727e-05F,   /* 0x38b075b5 */
1419     1.69945167726837098598e-04F,   /* 0x3932334f */
1420     1.95121858268976211548e-04F,   /* 0x394c99a0 */
1421     1.60778334247879683971e-04F,   /* 0x3928969b */
1422     6.79871009197086095810e-05F,   /* 0x388e944c */
1423     1.61929419846273958683e-04F,   /* 0x3929cb99 */
1424     1.99474830878898501396e-04F,   /* 0x39512a1e */
1425     1.81604162207804620266e-04F,   /* 0x393e6cff */
1426     1.09270178654696792364e-04F,   /* 0x38e527fb */
1427     2.27539261686615645885e-04F,   /* 0x396e979b */
1428     4.90300008095800876617e-05F,   /* 0x384da590 */
1429     6.28985289949923753738e-05F,   /* 0x3883e864 */
1430     2.58551553997676819563e-05F,   /* 0x37d8e386 */
1431     1.82868374395184218884e-04F,   /* 0x393fc05b */
1432     4.64625991298817098141e-05F,   /* 0x3842e0d6 */
1433     1.05703387816902250051e-04F,   /* 0x38ddad13 */
1434     1.17213814519345760345e-04F,   /* 0x38f5d0b0 */
1435     8.17377731436863541603e-05F,   /* 0x38ab6aa2 */
1436     0.00000000000000000000e+00F,   /* 0x00000000 */
1437     1.16847433673683553934e-04F,   /* 0x38f50bfd */
1438     1.88827965757809579372e-04F,   /* 0x3946001f */
1439     2.16612941585481166840e-04F,   /* 0x39632298 */
1440     2.00857131858356297016e-04F,   /* 0x39529d2d */
1441     1.42199307447299361229e-04F,   /* 0x39151b56 */
1442     4.12627305195201188326e-05F,   /* 0x382d1185 */
1443     1.42796401632949709892e-04F,   /* 0x3915bb9e */
1444     2.03253570361994206905e-04F,   /* 0x39552077 */
1445     2.23214170546270906925e-04F,   /* 0x396a0e99 */
1446     2.03244591830298304558e-04F,   /* 0x39551e0e */
1447     1.43898156238719820976e-04F,   /* 0x3916e35e */
1448     4.57155256299301981926e-05F,   /* 0x383fbeac */
1449     1.53365719597786664963e-04F,   /* 0x3920d0cc */
1450     2.23224633373320102692e-04F,   /* 0x396a1168 */
1451     1.16566716314991936088e-05F,   /* 0x37439106 */
1452     7.43694272387074306607e-06F,   /* 0x36f98ada */
1453     2.11048507480882108212e-04F,   /* 0x395d4ce7 */
1454     1.34682719362899661064e-04F,   /* 0x390d399e */
1455     2.29425968427676707506e-05F,   /* 0x37c074da */
1456     1.20421340398024767637e-04F,   /* 0x38fc8ab7 */
1457     1.83421318070031702518e-04F,   /* 0x394054c9 */
1458     2.12376224226318299770e-04F,   /* 0x395eb14f */
1459     2.07710763788782060146e-04F,   /* 0x3959ccef */
1460     1.69840845046564936638e-04F,   /* 0x3932174e */
1461     9.91739216260612010956e-05F,   /* 0x38cffb98 */
1462     2.40249748458154499531e-04F,   /* 0x397beb8d */
1463     1.05178231024183332920e-04F,   /* 0x38dc9322 */
1464     1.82623916771262884140e-04F,   /* 0x393f7ebc */
1465     2.28821940254420042038e-04F,   /* 0x396fefec */
1466     0.00000000000000000000e+00F};  /* 0x00000000 */
1467 
1468 
1469 /* Handle special arguments first */
1470 
1471   GET_BITS_SP32(x, ux);
1472   ax = ux & (~SIGNBIT_SP32);
1473 
1474   if(ax >= 0x7f800000)
1475     {
1476       /* x is either NaN or infinity */
1477       if (ux & MANTBITS_SP32)
1478         /* x is NaN */
1479         return x + x; /* Raise invalid if it is a signalling NaN */
1480       else if (ux & SIGNBIT_SP32)
1481         return nanf_with_flags(AMD_F_INVALID);
1482       else
1483         /* x is positive infinity */
1484         return x;
1485     }
1486   else if (ux & SIGNBIT_SP32)
1487     {
1488       /* x is negative. */
1489       if (x == 0.0F)
1490         /* Handle negative zero first */
1491         return x;
1492       else
1493         return nanf_with_flags(AMD_F_INVALID);
1494     }
1495   else if (ux <= 0x007fffff)
1496     {
1497       /* x is denormalised or zero */
1498       if (ux == 0)
1499         /* x is zero */
1500         return x;
1501       else
1502         {
1503           /* x is denormalised; scale it up */
1504           /* Normalize x by increasing the exponent by 26
1505              and subtracting a correction to account for the implicit
1506              bit. This replaces a slow denormalized
1507              multiplication by a fast normal subtraction. */
1508           static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */
1509           denorm = 1;
1510           GET_BITS_SP32(x, ux);
1511           PUT_BITS_SP32(ux | 0x0d800000, x);
1512           x -= corr;
1513           GET_BITS_SP32(x, ux);
1514         }
1515     }
1516 
1517   /* Main algorithm */
1518 
1519   /*
1520      Find y and e such that x = 2^e * y, where y in [1,4).
1521      This is done using an in-lined variant of splitFloat,
1522      which also ensures that e is even.
1523    */
1524   y = x;
1525   ux &= EXPBITS_SP32;
1526   ux >>= EXPSHIFTBITS_SP32;
1527   if (ux & 1)
1528     {
1529       GET_BITS_SP32(y, u);
1530       u &= (SIGNBIT_SP32 | MANTBITS_SP32);
1531       u |= ONEEXPBITS_SP32;
1532       PUT_BITS_SP32(u, y);
1533       e = ux - EXPBIAS_SP32;
1534     }
1535   else
1536     {
1537       GET_BITS_SP32(y, u);
1538       u &= (SIGNBIT_SP32 | MANTBITS_SP32);
1539       u |= TWOEXPBITS_SP32;
1540       PUT_BITS_SP32(u, y);
1541       e = ux - EXPBIAS_SP32 - 1;
1542     }
1543 
1544   /* Find the index of the sub-interval of [1,4) in which y lies. */
1545 
1546   index = (int)(32.0F*y+0.5);
1547 
1548   /* Look up the table values and compute c and r = c/t */
1549 
1550   rtc_lead = rt_jby32_lead_table_float[index-32];
1551   rtc_trail = rt_jby32_trail_table_float[index-32];
1552   c = 0.03125F*index;
1553   r = (y - c)/c;
1554 
1555   /*
1556   Find q = sqrt(1+r) - 1.
1557   From one step of Newton on (q+1)^2 = 1+r
1558   */
1559 
1560   p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F));
1561   twop = p + p;
1562   q = p - (p*p + (twop - r))/(twop + 2.0);
1563 
1564   /* Reconstruction */
1565 
1566   rtc = rtc_lead + rtc_trail;
1567   e >>= 1; /* e = e/2 */
1568   z = rtc_lead + (rtc*q+rtc_trail);
1569 
1570   if (denorm)
1571     {
1572       /* Scale by 2**(e-13) */
1573       PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
1574       z *= r;
1575     }
1576   else
1577     {
1578       /* Scale by 2**e */
1579       PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
1580       z *= r;
1581     }
1582 
1583   return z;
1584 
1585 }
1586 #endif /* SQRTF_AMD_INLINE */
1587 
1588 #ifdef USE_LOG_KERNEL_AMD
log_kernel_amd64(double x,unsigned long long ux,int * xexp,double * r1,double * r2)1589 static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
1590 {
1591 
1592   int expadjust;
1593   double r, z1, z2, correction, f, f1, f2, q, u, v, poly;
1594   int index;
1595 
1596   /*
1597     Computes natural log(x). Algorithm based on:
1598     Ping-Tak Peter Tang
1599     "Table-driven implementation of the logarithm function in IEEE
1600     floating-point arithmetic"
1601     ACM Transactions on Mathematical Software (TOMS)
1602     Volume 16, Issue 4 (December 1990)
1603   */
1604 
1605 /* Arrays ln_lead_table and ln_tail_table contain
1606    leading and trailing parts respectively of precomputed
1607    values of natural log(1+i/64), for i = 0, 1, ..., 64.
1608    ln_lead_table contains the first 24 bits of precision,
1609    and ln_tail_table contains a further 53 bits precision. */
1610 
1611   static const double ln_lead_table[65] = {
1612     0.00000000000000000000e+00,   /* 0x0000000000000000 */
1613     1.55041813850402832031e-02,   /* 0x3f8fc0a800000000 */
1614     3.07716131210327148438e-02,   /* 0x3f9f829800000000 */
1615     4.58095073699951171875e-02,   /* 0x3fa7745800000000 */
1616     6.06245994567871093750e-02,   /* 0x3faf0a3000000000 */
1617     7.52233862876892089844e-02,   /* 0x3fb341d700000000 */
1618     8.96121263504028320312e-02,   /* 0x3fb6f0d200000000 */
1619     1.03796780109405517578e-01,   /* 0x3fba926d00000000 */
1620     1.17783010005950927734e-01,   /* 0x3fbe270700000000 */
1621     1.31576299667358398438e-01,   /* 0x3fc0d77e00000000 */
1622     1.45181953907012939453e-01,   /* 0x3fc2955280000000 */
1623     1.58604979515075683594e-01,   /* 0x3fc44d2b00000000 */
1624     1.71850204467773437500e-01,   /* 0x3fc5ff3000000000 */
1625     1.84922337532043457031e-01,   /* 0x3fc7ab8900000000 */
1626     1.97825729846954345703e-01,   /* 0x3fc9525a80000000 */
1627     2.10564732551574707031e-01,   /* 0x3fcaf3c900000000 */
1628     2.23143517971038818359e-01,   /* 0x3fcc8ff780000000 */
1629     2.35566020011901855469e-01,   /* 0x3fce270700000000 */
1630     2.47836112976074218750e-01,   /* 0x3fcfb91800000000 */
1631     2.59957492351531982422e-01,   /* 0x3fd0a324c0000000 */
1632     2.71933674812316894531e-01,   /* 0x3fd1675c80000000 */
1633     2.83768117427825927734e-01,   /* 0x3fd22941c0000000 */
1634     2.95464158058166503906e-01,   /* 0x3fd2e8e280000000 */
1635     3.07025015354156494141e-01,   /* 0x3fd3a64c40000000 */
1636     3.18453729152679443359e-01,   /* 0x3fd4618bc0000000 */
1637     3.29753279685974121094e-01,   /* 0x3fd51aad80000000 */
1638     3.40926527976989746094e-01,   /* 0x3fd5d1bd80000000 */
1639     3.51976394653320312500e-01,   /* 0x3fd686c800000000 */
1640     3.62905442714691162109e-01,   /* 0x3fd739d7c0000000 */
1641     3.73716354370117187500e-01,   /* 0x3fd7eaf800000000 */
1642     3.84411692619323730469e-01,   /* 0x3fd89a3380000000 */
1643     3.94993782043457031250e-01,   /* 0x3fd9479400000000 */
1644     4.05465066432952880859e-01,   /* 0x3fd9f323c0000000 */
1645     4.15827870368957519531e-01,   /* 0x3fda9cec80000000 */
1646     4.26084339618682861328e-01,   /* 0x3fdb44f740000000 */
1647     4.36236739158630371094e-01,   /* 0x3fdbeb4d80000000 */
1648     4.46287095546722412109e-01,   /* 0x3fdc8ff7c0000000 */
1649     4.56237375736236572266e-01,   /* 0x3fdd32fe40000000 */
1650     4.66089725494384765625e-01,   /* 0x3fddd46a00000000 */
1651     4.75845873355865478516e-01,   /* 0x3fde744240000000 */
1652     4.85507786273956298828e-01,   /* 0x3fdf128f40000000 */
1653     4.95077252388000488281e-01,   /* 0x3fdfaf5880000000 */
1654     5.04556000232696533203e-01,   /* 0x3fe02552a0000000 */
1655     5.13945698738098144531e-01,   /* 0x3fe0723e40000000 */
1656     5.23248136043548583984e-01,   /* 0x3fe0be72e0000000 */
1657     5.32464742660522460938e-01,   /* 0x3fe109f380000000 */
1658     5.41597247123718261719e-01,   /* 0x3fe154c3c0000000 */
1659     5.50647079944610595703e-01,   /* 0x3fe19ee6a0000000 */
1660     5.59615731239318847656e-01,   /* 0x3fe1e85f40000000 */
1661     5.68504691123962402344e-01,   /* 0x3fe23130c0000000 */
1662     5.77315330505371093750e-01,   /* 0x3fe2795e00000000 */
1663     5.86049020290374755859e-01,   /* 0x3fe2c0e9e0000000 */
1664     5.94707071781158447266e-01,   /* 0x3fe307d720000000 */
1665     6.03290796279907226562e-01,   /* 0x3fe34e2880000000 */
1666     6.11801505088806152344e-01,   /* 0x3fe393e0c0000000 */
1667     6.20240390300750732422e-01,   /* 0x3fe3d90260000000 */
1668     6.28608644008636474609e-01,   /* 0x3fe41d8fe0000000 */
1669     6.36907458305358886719e-01,   /* 0x3fe4618bc0000000 */
1670     6.45137906074523925781e-01,   /* 0x3fe4a4f840000000 */
1671     6.53301239013671875000e-01,   /* 0x3fe4e7d800000000 */
1672     6.61398470401763916016e-01,   /* 0x3fe52a2d20000000 */
1673     6.69430613517761230469e-01,   /* 0x3fe56bf9c0000000 */
1674     6.77398800849914550781e-01,   /* 0x3fe5ad4040000000 */
1675     6.85303986072540283203e-01,   /* 0x3fe5ee02a0000000 */
1676     6.93147122859954833984e-01};  /* 0x3fe62e42e0000000 */
1677 
1678   static const double ln_tail_table[65] = {
1679     0.00000000000000000000e+00,   /* 0x0000000000000000 */
1680     5.15092497094772879206e-09,   /* 0x3e361f807c79f3db */
1681     4.55457209735272790188e-08,   /* 0x3e6873c1980267c8 */
1682     2.86612990859791781788e-08,   /* 0x3e5ec65b9f88c69e */
1683     2.23596477332056055352e-08,   /* 0x3e58022c54cc2f99 */
1684     3.49498983167142274770e-08,   /* 0x3e62c37a3a125330 */
1685     3.23392843005887000414e-08,   /* 0x3e615cad69737c93 */
1686     1.35722380472479366661e-08,   /* 0x3e4d256ab1b285e9 */
1687     2.56504325268044191098e-08,   /* 0x3e5b8abcb97a7aa2 */
1688     5.81213608741512136843e-08,   /* 0x3e6f34239659a5dc */
1689     5.59374849578288093334e-08,   /* 0x3e6e07fd48d30177 */
1690     5.06615629004996189970e-08,   /* 0x3e6b32df4799f4f6 */
1691     5.24588857848400955725e-08,   /* 0x3e6c29e4f4f21cf8 */
1692     9.61968535632653505972e-10,   /* 0x3e1086c848df1b59 */
1693     1.34829655346594463137e-08,   /* 0x3e4cf456b4764130 */
1694     3.65557749306383026498e-08,   /* 0x3e63a02ffcb63398 */
1695     3.33431709374069198903e-08,   /* 0x3e61e6a6886b0976 */
1696     5.13008650536088382197e-08,   /* 0x3e6b8abcb97a7aa2 */
1697     5.09285070380306053751e-08,   /* 0x3e6b578f8aa35552 */
1698     3.20853940845502057341e-08,   /* 0x3e6139c871afb9fc */
1699     4.06713248643004200446e-08,   /* 0x3e65d5d30701ce64 */
1700     5.57028186706125221168e-08,   /* 0x3e6de7bcb2d12142 */
1701     5.48356693724804282546e-08,   /* 0x3e6d708e984e1664 */
1702     1.99407553679345001938e-08,   /* 0x3e556945e9c72f36 */
1703     1.96585517245087232086e-09,   /* 0x3e20e2f613e85bda */
1704     6.68649386072067321503e-09,   /* 0x3e3cb7e0b42724f6 */
1705     5.89936034642113390002e-08,   /* 0x3e6fac04e52846c7 */
1706     2.85038578721554472484e-08,   /* 0x3e5e9b14aec442be */
1707     5.09746772910284482606e-08,   /* 0x3e6b5de8034e7126 */
1708     5.54234668933210171467e-08,   /* 0x3e6dc157e1b259d3 */
1709     6.29100830926604004874e-09,   /* 0x3e3b05096ad69c62 */
1710     2.61974119468563937716e-08,   /* 0x3e5c2116faba4cdd */
1711     4.16752115011186398935e-08,   /* 0x3e665fcc25f95b47 */
1712     2.47747534460820790327e-08,   /* 0x3e5a9a08498d4850 */
1713     5.56922172017964209793e-08,   /* 0x3e6de647b1465f77 */
1714     2.76162876992552906035e-08,   /* 0x3e5da71b7bf7861d */
1715     7.08169709942321478061e-09,   /* 0x3e3e6a6886b09760 */
1716     5.77453510221151779025e-08,   /* 0x3e6f0075eab0ef64 */
1717     4.43021445893361960146e-09,   /* 0x3e33071282fb989b */
1718     3.15140984357495864573e-08,   /* 0x3e60eb43c3f1bed2 */
1719     2.95077445089736670973e-08,   /* 0x3e5faf06ecb35c84 */
1720     1.44098510263167149349e-08,   /* 0x3e4ef1e63db35f68 */
1721     1.05196987538551827693e-08,   /* 0x3e469743fb1a71a5 */
1722     5.23641361722697546261e-08,   /* 0x3e6c1cdf404e5796 */
1723     7.72099925253243069458e-09,   /* 0x3e4094aa0ada625e */
1724     5.62089493829364197156e-08,   /* 0x3e6e2d4c96fde3ec */
1725     3.53090261098577946927e-08,   /* 0x3e62f4d5e9a98f34 */
1726     3.80080516835568242269e-08,   /* 0x3e6467c96ecc5cbe */
1727     5.66961038386146408282e-08,   /* 0x3e6e7040d03dec5a */
1728     4.42287063097349852717e-08,   /* 0x3e67bebf4282de36 */
1729     3.45294525105681104660e-08,   /* 0x3e6289b11aeb783f */
1730     2.47132034530447431509e-08,   /* 0x3e5a891d1772f538 */
1731     3.59655343422487209774e-08,   /* 0x3e634f10be1fb591 */
1732     5.51581770357780862071e-08,   /* 0x3e6d9ce1d316eb93 */
1733     3.60171867511861372793e-08,   /* 0x3e63562a19a9c442 */
1734     1.94511067964296180547e-08,   /* 0x3e54e2adf548084c */
1735     1.54137376631349347838e-08,   /* 0x3e508ce55cc8c97a */
1736     3.93171034490174464173e-09,   /* 0x3e30e2f613e85bda */
1737     5.52990607758839766440e-08,   /* 0x3e6db03ebb0227bf */
1738     3.29990737637586136511e-08,   /* 0x3e61b75bb09cb098 */
1739     1.18436010922446096216e-08,   /* 0x3e496f16abb9df22 */
1740     4.04248680368301346709e-08,   /* 0x3e65b3f399411c62 */
1741     2.27418915900284316293e-08,   /* 0x3e586b3e59f65355 */
1742     1.70263791333409206020e-08,   /* 0x3e52482ceae1ac12 */
1743     5.76999904754328540596e-08};  /* 0x3e6efa39ef35793c */
1744 
1745   /* Approximating polynomial coefficients for x near 1.0 */
1746   static const double
1747     ca_1 = 8.33333333333317923934e-02,  /* 0x3fb55555555554e6 */
1748     ca_2 = 1.25000000037717509602e-02,  /* 0x3f89999999bac6d4 */
1749     ca_3 = 2.23213998791944806202e-03,  /* 0x3f62492307f1519f */
1750     ca_4 = 4.34887777707614552256e-04;  /* 0x3f3c8034c85dfff0 */
1751 
1752   /* Approximating polynomial coefficients for other x */
1753   static const double
1754     cb_1 = 8.33333333333333593622e-02,  /* 0x3fb5555555555557 */
1755     cb_2 = 1.24999999978138668903e-02,  /* 0x3f89999999865ede */
1756     cb_3 = 2.23219810758559851206e-03;  /* 0x3f6249423bd94741 */
1757 
1758   static const unsigned long long
1759     log_thresh1 = 0x3fee0faa00000000,
1760     log_thresh2 = 0x3ff1082c00000000;
1761 
1762   /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
1763      log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */
1764   if (ux >= log_thresh1 && ux <= log_thresh2)
1765     {
1766       /* Arguments close to 1.0 are handled separately to maintain
1767          accuracy.
1768 
1769          The approximation in this region exploits the identity
1770              log( 1 + r ) = log( 1 + u/2 )  /  log( 1 - u/2 ), where
1771              u  = 2r / (2+r).
1772          Note that the right hand side has an odd Taylor series expansion
1773          which converges much faster than the Taylor series expansion of
1774          log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
1775              u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).
1776 
1777          One subtlety is that since u cannot be calculated from
1778          r exactly, the rounding error in the first u should be
1779          avoided if possible. To accomplish this, we observe that
1780                        u  =  r  -  r*r/(2+r).
1781          Since x (=1+r) is the input argument, and thus presumed exact,
1782          the formula above approximates u accurately because
1783                        u  =  r  -  correction,
1784          and the magnitude of "correction" (of the order of r*r)
1785          is small.
1786          With these observations, we will approximate log( 1 + r ) by
1787             r + (  (A1*u^3 + ... + An*u^(2n+1)) - correction ).
1788 
1789          We approximate log(1+r) by an odd polynomial in u, where
1790                   u = 2r/(2+r) = r - r*r/(2+r).
1791       */
1792       r = x - 1.0;
1793       u = r / (2.0 + r);
1794       correction = r * u;
1795       u = u + u;
1796       v = u * u;
1797       z1 = r;
1798       z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
1799       *r1 = z1;
1800       *r2 = z2;
1801       *xexp = 0;
1802     }
1803   else
1804     {
1805       /*
1806         First, we decompose the argument x to the form
1807         x  =  2**M  *  (F1  +  F2),
1808         where  1 <= F1+F2 < 2, M has the value of an integer,
1809         F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.
1810 
1811         Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
1812         in U, where U  =  2 F2 / (2 F2 + F1).
1813         Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
1814         The core approximation calculates
1815         Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U   -   1.
1816         Note that  log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
1817         thus, Poly =  2 arctanh( U/2 ) / U  -  1.
1818 
1819         It is not hard to see that
1820           log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
1821         Hence, we return Z1 = log(F1), and  Z2 = log( 1 + F2/F1).
1822         The values of log(F1) are calculated beforehand and stored
1823         in the program.
1824       */
1825 
1826       f = x;
1827       if (ux < IMPBIT_DP64)
1828         {
1829           /* The input argument x is denormalized */
1830           /* Normalize f by increasing the exponent by 60
1831              and subtracting a correction to account for the implicit
1832              bit. This replaces a slow denormalized
1833              multiplication by a fast normal subtraction. */
1834           static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
1835           GET_BITS_DP64(f, ux);
1836           ux |= 0x03d0000000000000;
1837           PUT_BITS_DP64(ux, f);
1838           f -= corr;
1839           GET_BITS_DP64(f, ux);
1840           expadjust = 60;
1841         }
1842       else
1843         expadjust = 0;
1844 
1845       /* Store the exponent of x in xexp and put
1846          f into the range [0.5,1) */
1847       *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
1848       PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);
1849 
1850       /* Now  x = 2**xexp  * f,  1/2 <= f < 1. */
1851 
1852       /* Set index to be the nearest integer to 128*f */
1853       r = 128.0 * f;
1854       index = (int)(r + 0.5);
1855 
1856       z1 = ln_lead_table[index-64];
1857       q = ln_tail_table[index-64];
1858       f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
1859       f2 = f - f1;
1860       /* At this point, x = 2**xexp * ( f1  +  f2 ) where
1861          f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */
1862 
1863       /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
1864       /* u = f2 / (f1 + 0.5 * f2); */
1865       u = f2 / (f1 + 0.5 * f2);
1866 
1867       /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
1868          The core approximation calculates
1869          poly = [log(1 + u/2) - log(1 - u/2)]/u  -  1  */
1870       v = u * u;
1871       poly = (v * (cb_1 + v * (cb_2 + v * cb_3)));
1872       z2 = q + (u + u * poly);
1873       *r1 = z1;
1874       *r2 = z2;
1875     }
1876   return;
1877 }
1878 #endif /* USE_LOG_KERNEL_AMD */
1879 
1880 #if defined(USE_REMAINDER_PIBY2F_INLINE)
1881 /* Define this to get debugging print statements activated */
1882 #define DEBUGGING_PRINT
1883 #undef DEBUGGING_PRINT
1884 
1885 
1886 #ifdef DEBUGGING_PRINT
1887 #include <stdio.h>
d2b(long long d,int bitsper,int point)1888 char *d2b(long long d, int bitsper, int point)
1889 {
1890   static char buff[200];
1891   int i, j;
1892   j = bitsper;
1893   if (point >= 0 && point <= bitsper)
1894     j++;
1895   buff[j] = '\0';
1896   for (i = bitsper - 1; i >= 0; i--)
1897     {
1898       j--;
1899       if (d % 2 == 1)
1900         buff[j] = '1';
1901       else
1902         buff[j] = '0';
1903       if (i == point)
1904         {
1905           j--;
1906           buff[j] = '.';
1907         }
1908       d /= 2;
1909     }
1910   return buff;
1911 }
1912 #endif
1913 
1914 /* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
1915    extra precision, and return the result in r.
1916    Return value "region" tells how many lots of pi/2 were subtracted
1917    from x to put it in the range [-pi/4,pi/4], mod 4. */
__remainder_piby2f_inline(unsigned long long ux,double * r,int * region)1918 static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region)
1919 {
1920 
1921       /* This method simulates multi-precision floating-point
1922          arithmetic and is accurate for all 1 <= x < infinity */
1923 #if 0
1924       const int bitsper = 36;
1925 #else
1926 #define bitsper 36
1927 #endif
1928       unsigned long long res[10];
1929       unsigned long long u, carry, mask, mant, nextbits;
1930       int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
1931       double dx;
1932       static const double
1933         piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
1934       static unsigned long long pibits[] =
1935       {
1936         0LL,
1937         5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
1938         34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
1939         19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
1940       };
1941 
1942       xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
1943       ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;
1944 
1945 
1946       /* Now ux is the mantissa bit pattern of x as a long long integer */
1947       mask = 1;
1948       mask = (mask << bitsper) - 1;
1949 
1950       /* Set first and last to the positions of the first
1951          and last chunks of 2/pi that we need */
1952       first = xexp / bitsper;
1953       resexp = xexp - first * bitsper;
1954       /* 120 is the theoretical maximum number of bits (actually
1955          115 for IEEE single precision) that we need to extract
1956          from the middle of 2/pi to compute the reduced argument
1957          accurately enough for our purposes */
1958       last = first + 120 / bitsper;
1959 
1960 #ifdef DEBUGGING_PRINT
1961       printf("first = %d, last = %d\n", first, last);
1962 #endif
1963 
1964       /* Do a long multiplication of the bits of 2/pi by the
1965          integer mantissa */
1966 #if 0
1967       for (i = last; i >= first; i--)
1968         {
1969           u = pibits[i] * ux + carry;
1970           res[i - first] = u & mask;
1971           carry = u >> bitsper;
1972         }
1973       res[last - first + 1] = 0;
1974 #else
1975       /* Unroll the loop. This is only correct because we know
1976          that bitsper is fixed as 36. */
1977       res[4] = 0;
1978       u = pibits[last] * ux;
1979       res[3] = u & mask;
1980       carry = u >> bitsper;
1981       u = pibits[last - 1] * ux + carry;
1982       res[2] = u & mask;
1983       carry = u >> bitsper;
1984       u = pibits[last - 2] * ux + carry;
1985       res[1] = u & mask;
1986       carry = u >> bitsper;
1987       u = pibits[first] * ux + carry;
1988       res[0] = u & mask;
1989 #endif
1990 
1991 #ifdef DEBUGGING_PRINT
1992       printf("resexp = %d\n", resexp);
1993       printf("Significant part of x * 2/pi with binary"
1994              " point in correct place:\n");
1995       for (i = 0; i <= last - first; i++)
1996         {
1997           if (i > 0 && i % 5 == 0)
1998             printf("\n ");
1999           if (i == 1)
2000             printf("%s ", d2b(res[i], bitsper, resexp));
2001           else
2002             printf("%s ", d2b(res[i], bitsper, -1));
2003         }
2004       printf("\n");
2005 #endif
2006 
2007       /* Reconstruct the result */
2008       ltb = (int)((((res[0] << bitsper) | res[1])
2009                    >> (bitsper - 1 - resexp)) & 7);
2010 
2011       /* determ says whether the fractional part is >= 0.5 */
2012       determ = ltb & 1;
2013 
2014 #ifdef DEBUGGING_PRINT
2015       printf("ltb = %d (last two bits before binary point"
2016              " and first bit after)\n", ltb);
2017       printf("determ = %d (1 means need to negate because the fractional\n"
2018              "            part of x * 2/pi is greater than 0.5)\n", determ);
2019 #endif
2020 
2021       i = 1;
2022       if (determ)
2023         {
2024           /* The mantissa is >= 0.5. We want to subtract it
2025              from 1.0 by negating all the bits */
2026           *region = ((ltb >> 1) + 1) & 3;
2027           mant = 1;
2028           mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
2029           while (mant < 0x0000000000010000)
2030             {
2031               i++;
2032               mant = (mant << bitsper) | (~(res[i]) & mask);
2033             }
2034           nextbits = (~(res[i+1]) & mask);
2035         }
2036       else
2037         {
2038           *region = (ltb >> 1);
2039           mant = 1;
2040           mant = res[1] & ((mant << (bitsper - resexp)) - 1);
2041           while (mant < 0x0000000000010000)
2042             {
2043               i++;
2044               mant = (mant << bitsper) | res[i];
2045             }
2046           nextbits = res[i+1];
2047         }
2048 
2049 #ifdef DEBUGGING_PRINT
2050       printf("First bits of mant = %s\n", d2b(mant, bitsper, -1));
2051 #endif
2052 
2053       /* Normalize the mantissa. The shift value 6 here, determined by
2054          trial and error, seems to give optimal speed. */
2055       bc = 0;
2056       while (mant < 0x0000400000000000)
2057         {
2058           bc += 6;
2059           mant <<= 6;
2060         }
2061       while (mant < 0x0010000000000000)
2062         {
2063           bc++;
2064           mant <<= 1;
2065         }
2066       mant |= nextbits >> (bitsper - bc);
2067 
2068       rexp = 52 + resexp - bc - i * bitsper;
2069 
2070 #ifdef DEBUGGING_PRINT
2071       printf("Normalised mantissa = 0x%016lx\n", mant);
2072       printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
2073 #endif
2074 
2075       /* Put the result exponent rexp onto the mantissa pattern */
2076       u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
2077       ux = (mant & MANTBITS_DP64) | u;
2078       if (determ)
2079         /* If we negated the mantissa we negate x too */
2080         ux |= SIGNBIT_DP64;
2081       PUT_BITS_DP64(ux, dx);
2082 
2083 #ifdef DEBUGGING_PRINT
2084       printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx));
2085 #endif
2086 
2087       /* x is a double precision version of the fractional part of
2088          x * 2 / pi. Multiply x by pi/2 in double precision
2089          to get the reduced argument r. */
2090       *r = dx * piby2;
2091 
2092 #ifdef DEBUGGING_PRINT
2093       printf(" r = frac(x*2/pi) * pi/2:\n");
2094       printf(" r = %25.20e = %s\n", *r, double2hex(r));
2095       printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
2096              *region);
2097 #endif
2098 }
2099 #endif /* USE_REMAINDER_PIBY2F_INLINE */
2100 
2101 #endif /* LIBM_INLINES_AMD_H_INCLUDED */
2102