1 /***********************************************************************************/
2 /** MIT License **/
3 /** ----------- **/
4 /** **/
5 /** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
6 /** **/
7 /** Permission is hereby granted, free of charge, to any person obtaining a copy **/
8 /** of this Software and associated documentaon files (the "Software"), to deal **/
9 /** in the Software without restriction, including without limitation the rights **/
10 /** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
11 /** copies of the Software, and to permit persons to whom the Software is **/
12 /** furnished to do so, subject to the following conditions: **/
13 /** **/
14 /** The above copyright notice and this permission notice shall be included in **/
15 /** all copies or substantial portions of the Software. **/
16 /** **/
17 /** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
18 /** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
19 /** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
20 /** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
21 /** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
22 /** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
23 /** THE SOFTWARE. **/
24 /***********************************************************************************/
25
26 #ifndef LIBM_INLINES_AMD_H_INCLUDED
27 #define LIBM_INLINES_AMD_H_INCLUDED 1
28
29 #include "libm_util.h"
30
31 /* Set defines for inline functions calling other inlines */
32 #if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \
33 defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \
34 defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \
35 defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \
36 defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \
37 defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \
38 (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF))
39 #undef USE_RAISE_FPSW_FLAGS
40 #define USE_RAISE_FPSW_FLAGS 1
41 #endif
42
43 #if defined(USE_SPLITDOUBLE)
44 /* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
45 Assumes that x is not zero, denormal, infinity or NaN, but these conditions
46 are not checked */
splitDouble(double x,int * e,double * m)47 static inline void splitDouble(double x, int *e, double *m)
48 {
49 unsigned long long ux, uy;
50 GET_BITS_DP64(x, ux);
51 uy = ux;
52 ux &= EXPBITS_DP64;
53 ux >>= EXPSHIFTBITS_DP64;
54 *e = (int)ux - EXPBIAS_DP64 + 1;
55 uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64;
56 PUT_BITS_DP64(uy, x);
57 *m = x;
58 }
59 #endif /* USE_SPLITDOUBLE */
60
61
62 #if defined(USE_SPLITDOUBLE_2)
63 /* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0.
64 Assumes that x is not zero, denormal, infinity or NaN, but these conditions
65 are not checked. Also assumes EXPBIAS_DP is odd. With this
66 assumption, e will be even on exit. */
splitDouble_2(double x,int * e,double * m)67 static inline void splitDouble_2(double x, int *e, double *m)
68 {
69 unsigned long long ux, vx;
70 GET_BITS_DP64(x, ux);
71 vx = ux;
72 ux &= EXPBITS_DP64;
73 ux >>= EXPSHIFTBITS_DP64;
74 if (ux & 1)
75 {
76 /* The exponent is odd */
77 vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64;
78 PUT_BITS_DP64(vx, x);
79 *m = x;
80 *e = ux - EXPBIAS_DP64;
81 }
82 else
83 {
84 /* The exponent is even */
85 vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64;
86 PUT_BITS_DP64(vx, x);
87 *m = x;
88 *e = ux - EXPBIAS_DP64 - 1;
89 }
90 }
91 #endif /* USE_SPLITDOUBLE_2 */
92
93
94 #if defined(USE_SPLITFLOAT)
95 /* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0.
96 Assumes that x is not zero, denormal, infinity or NaN, but these conditions
97 are not checked */
splitFloat(float x,int * e,float * m)98 static inline void splitFloat(float x, int *e, float *m)
99 {
100 unsigned int ux, uy;
101 GET_BITS_SP32(x, ux);
102 uy = ux;
103 ux &= EXPBITS_SP32;
104 ux >>= EXPSHIFTBITS_SP32;
105 *e = (int)ux - EXPBIAS_SP32 + 1;
106 uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32;
107 PUT_BITS_SP32(uy, x);
108 *m = x;
109 }
110 #endif /* USE_SPLITFLOAT */
111
112
113 #if defined(USE_SCALEDOUBLE_1)
114 /* Scales the double x by 2.0**n.
115 Assumes EMIN <= n <= EMAX, though this condition is not checked. */
scaleDouble_1(double x,int n)116 static inline double scaleDouble_1(double x, int n)
117 {
118 double t;
119 /* Construct the number t = 2.0**n */
120 PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t);
121 return x*t;
122 }
123 #endif /* USE_SCALEDOUBLE_1 */
124
125
126 #if defined(USE_SCALEDOUBLE_2)
127 /* Scales the double x by 2.0**n.
128 Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
scaleDouble_2(double x,int n)129 static inline double scaleDouble_2(double x, int n)
130 {
131 double t1, t2;
132 int n1, n2;
133 n1 = n / 2;
134 n2 = n - n1;
135 /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
136 PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
137 PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
138 return (x*t1)*t2;
139 }
140 #endif /* USE_SCALEDOUBLE_2 */
141
142
143 #if defined(USE_SCALEDOUBLE_3)
144 /* Scales the double x by 2.0**n.
145 Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
scaleDouble_3(double x,int n)146 static inline double scaleDouble_3(double x, int n)
147 {
148 double t1, t2, t3;
149 int n1, n2, n3;
150 n1 = n / 3;
151 n2 = (n - n1) / 2;
152 n3 = n - n1 - n2;
153 /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
154 PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1);
155 PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2);
156 PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3);
157 return ((x*t1)*t2)*t3;
158 }
159 #endif /* USE_SCALEDOUBLE_3 */
160
161
162 #if defined(USE_SCALEFLOAT_1)
163 /* Scales the float x by 2.0**n.
164 Assumes EMIN <= n <= EMAX, though this condition is not checked. */
scaleFloat_1(float x,int n)165 static inline float scaleFloat_1(float x, int n)
166 {
167 float t;
168 /* Construct the number t = 2.0**n */
169 PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t);
170 return x*t;
171 }
172 #endif /* USE_SCALEFLOAT_1 */
173
174
175 #if defined(USE_SCALEFLOAT_2)
176 /* Scales the float x by 2.0**n.
177 Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */
scaleFloat_2(float x,int n)178 static inline float scaleFloat_2(float x, int n)
179 {
180 float t1, t2;
181 int n1, n2;
182 n1 = n / 2;
183 n2 = n - n1;
184 /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */
185 PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
186 PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
187 return (x*t1)*t2;
188 }
189 #endif /* USE_SCALEFLOAT_2 */
190
191
192 #if defined(USE_SCALEFLOAT_3)
193 /* Scales the float x by 2.0**n.
194 Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */
scaleFloat_3(float x,int n)195 static inline float scaleFloat_3(float x, int n)
196 {
197 float t1, t2, t3;
198 int n1, n2, n3;
199 n1 = n / 3;
200 n2 = (n - n1) / 2;
201 n3 = n - n1 - n2;
202 /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */
203 PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1);
204 PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2);
205 PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3);
206 return ((x*t1)*t2)*t3;
207 }
208 #endif /* USE_SCALEFLOAT_3 */
209
210 #if defined(USE_SETPRECISIONDOUBLE)
setPrecisionDouble(void)211 unsigned int setPrecisionDouble(void)
212 {
213 unsigned int cw, cwold = 0;
214 /* There is no precision control on Hammer */
215 return cwold;
216 }
217 #endif /* USE_SETPRECISIONDOUBLE */
218
219 #if defined(USE_RESTOREPRECISION)
restorePrecision(unsigned int cwold)220 void restorePrecision(unsigned int cwold)
221 {
222 /* There is no precision control on Hammer */
223 return;
224 }
225 #endif /* USE_RESTOREPRECISION */
226
227
228
229
230 #if defined(USE_RAISE_FPSW_FLAGS)
231 /* Raises floating-point status flags. The argument should be
232 the bitwise or of the flags to be raised, from the
233 list above, e.g.
234 raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
235 */
236
237 /* ISSUE - wat - 08182010
238 * These AMD_ISW_* flags are duplicated from trans.h
239 * this is not clean; Mark S. did it for targeted fix of 855457
240 * Eliminate all redundant flags in the next overhaul
241 */
242
243 #define AMD_ISW_INVALID 0x0001
244 #define AMD_ISW_DENORMAL 0x0002
245 #define AMD_ISW_ZERODIVIDE 0x0004
246 #define AMD_ISW_OVERFLOW 0x0008
247 #define AMD_ISW_UNDERFLOW 0x0010
248 #define AMD_ISW_INEXACT 0x0020
249
250 /* use this function from fpctrl.c */
251 void _set_statfp(uintptr_t);
252
raise_fpsw_flags(int flags)253 static inline void raise_fpsw_flags(int flags)
254 {
255 unsigned int f = 0;
256
257 if (flags & AMD_F_OVERFLOW) { f |= AMD_ISW_OVERFLOW; }
258 if (flags & AMD_F_UNDERFLOW) { f |= AMD_ISW_UNDERFLOW; }
259 if (flags & AMD_F_DIVBYZERO) { f |= AMD_ISW_ZERODIVIDE; }
260 if (flags & AMD_F_INVALID) { f |= AMD_ISW_INVALID; }
261 if (flags & AMD_F_INEXACT) { f |= AMD_ISW_INEXACT; }
262
263 _set_statfp(f);
264 }
265
266 #endif /* USE_RAISE_FPSW_FLAGS */
267
268
269 #if defined(USE_GET_FPSW_INLINE)
270 /* Return the current floating-point status word */
get_fpsw_inline(void)271 static inline unsigned int get_fpsw_inline(void)
272 {
273 return _mm_getcsr();
274 }
275 #endif /* USE_GET_FPSW_INLINE */
276
277 #if defined(USE_SET_FPSW_INLINE)
278 /* Set the floating-point status word */
set_fpsw_inline(unsigned int sw)279 static inline void set_fpsw_inline(unsigned int sw)
280 {
281 _mm_setcsr(sw);
282 }
283 #endif /* USE_SET_FPSW_INLINE */
284
285
286
287 #if defined(USE_VAL_WITH_FLAGS)
288 /* Returns a double value after raising the given flags,
289 e.g. val_with_flags(AMD_F_INEXACT);
290 */
val_with_flags(double val,int flags)291 static inline double val_with_flags(double val, int flags)
292 {
293 raise_fpsw_flags(flags);
294 return val;
295 }
296 #endif /* USE_VAL_WITH_FLAGS */
297
298 #if defined(USE_VALF_WITH_FLAGS)
299 /* Returns a float value after raising the given flags,
300 e.g. valf_with_flags(AMD_F_INEXACT);
301 */
valf_with_flags(float val,int flags)302 static inline float valf_with_flags(float val, int flags)
303 {
304 raise_fpsw_flags(flags);
305 return val;
306 }
307 #endif /* USE_VALF_WITH_FLAGS */
308
309
310 #if defined(USE_ZERO_WITH_FLAGS)
311 /* Returns a double +zero after raising the given flags,
312 e.g. zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
313 */
zero_with_flags(int flags)314 static inline double zero_with_flags(int flags)
315 {
316 raise_fpsw_flags(flags);
317 return 0.0;
318 }
319 #endif /* USE_ZERO_WITH_FLAGS */
320
321
322 #if defined(USE_ZEROF_WITH_FLAGS)
323 /* Returns a float +zero after raising the given flags,
324 e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
325 */
zerof_with_flags(int flags)326 static inline float zerof_with_flags(int flags)
327 {
328 raise_fpsw_flags(flags);
329 return 0.0F;
330 }
331 #endif /* USE_ZEROF_WITH_FLAGS */
332
333
334 #if defined(USE_NAN_WITH_FLAGS)
335 /* Returns a double quiet +nan after raising the given flags,
336 e.g. nan_with_flags(AMD_F_INVALID);
337 */
nan_with_flags(int flags)338 static inline double nan_with_flags(int flags)
339 {
340 double z;
341 raise_fpsw_flags(flags);
342 PUT_BITS_DP64(0x7ff8000000000000, z);
343 return z;
344 }
345 #endif /* USE_NAN_WITH_FLAGS */
346
347 #if defined(USE_NANF_WITH_FLAGS)
348 /* Returns a float quiet +nan after raising the given flags,
349 e.g. nanf_with_flags(AMD_F_INVALID);
350 */
nanf_with_flags(int flags)351 static inline float nanf_with_flags(int flags)
352 {
353 float z;
354 raise_fpsw_flags(flags);
355 PUT_BITS_SP32(0x7fc00000, z);
356 return z;
357 }
358 #endif /* USE_NANF_WITH_FLAGS */
359
360
361 #if defined(USE_INDEFINITE_WITH_FLAGS)
362 /* Returns a double indefinite after raising the given flags,
363 e.g. indefinite_with_flags(AMD_F_INVALID);
364 */
indefinite_with_flags(int flags)365 static inline double indefinite_with_flags(int flags)
366 {
367 double z;
368 raise_fpsw_flags(flags);
369 PUT_BITS_DP64(0xfff8000000000000, z);
370 return z;
371 }
372 #endif /* USE_INDEFINITE_WITH_FLAGS */
373
374 #if defined(USE_INDEFINITEF_WITH_FLAGS)
375 /* Returns a float quiet +indefinite after raising the given flags,
376 e.g. indefinitef_with_flags(AMD_F_INVALID);
377 */
indefinitef_with_flags(int flags)378 static inline float indefinitef_with_flags(int flags)
379 {
380 float z;
381 raise_fpsw_flags(flags);
382 PUT_BITS_SP32(0xffc00000, z);
383 return z;
384 }
385 #endif /* USE_INDEFINITEF_WITH_FLAGS */
386
387
388 #ifdef USE_INFINITY_WITH_FLAGS
389 /* Returns a positive double infinity after raising the given flags,
390 e.g. infinity_with_flags(AMD_F_OVERFLOW);
391 */
infinity_with_flags(int flags)392 static inline double infinity_with_flags(int flags)
393 {
394 double z;
395 raise_fpsw_flags(flags);
396 PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z);
397 return z;
398 }
399 #endif /* USE_INFINITY_WITH_FLAGS */
400
401 #ifdef USE_INFINITYF_WITH_FLAGS
402 /* Returns a positive float infinity after raising the given flags,
403 e.g. infinityf_with_flags(AMD_F_OVERFLOW);
404 */
infinityf_with_flags(int flags)405 static inline float infinityf_with_flags(int flags)
406 {
407 float z;
408 raise_fpsw_flags(flags);
409 PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z);
410 return z;
411 }
412 #endif /* USE_INFINITYF_WITH_FLAGS */
413
414 #if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)
415 #include <errno.h>
416 #endif
417
418 /* define the Microsoft specific error handling routine */
419 double _handle_error(
420 char *fname,
421 int opcode,
422 unsigned long long value,
423 int type,
424 int flags,
425 int error,
426 double arg1,
427 double arg2,
428 int nargs
429 );
430 float _handle_errorf(
431 char *fname,
432 int opcode,
433 unsigned long long value,
434 int type,
435 int flags,
436 int error,
437 float arg1,
438 float arg2,
439 int nargs
440 );
441
442 #if defined(USE_SPLITEXP)
443 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
444 Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
445 abs(x) > large/(ln(base)) (where large is the largest representable
446 floating point number) should be handled separately instead of calling
447 this function. This function is called by exp, exp2, exp10,
448 cosh and sinh. */
splitexp(double x,double logbase,double thirtytwo_by_logbaseof2,double logbaseof2_by_32_lead,double logbaseof2_by_32_trail,int * m,double * z1,double * z2)449 static inline void splitexp(double x, double logbase,
450 double thirtytwo_by_logbaseof2,
451 double logbaseof2_by_32_lead,
452 double logbaseof2_by_32_trail,
453 int *m, double *z1, double *z2)
454 {
455 double q, r, r1, r2, f1, f2;
456 int n, j;
457
458 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
459 leading and trailing parts respectively of precomputed
460 values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
461 two_to_jby32_lead_table contains the first 25 bits of precision,
462 and two_to_jby32_trail_table contains a further 53 bits precision. */
463
464 static const double two_to_jby32_lead_table[32] = {
465 1.00000000000000000000e+00, /* 0x3ff0000000000000 */
466 1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */
467 1.04427373409271240234e+00, /* 0x3ff0b55860000000 */
468 1.06714040040969848633e+00, /* 0x3ff11301d0000000 */
469 1.09050768613815307617e+00, /* 0x3ff172b830000000 */
470 1.11438673734664916992e+00, /* 0x3ff1d48730000000 */
471 1.13878858089447021484e+00, /* 0x3ff2387a60000000 */
472 1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */
473 1.18920707702636718750e+00, /* 0x3ff306fe00000000 */
474 1.21524733304977416992e+00, /* 0x3ff371a730000000 */
475 1.24185776710510253906e+00, /* 0x3ff3dea640000000 */
476 1.26905095577239990234e+00, /* 0x3ff44e0860000000 */
477 1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */
478 1.32523661851882934570e+00, /* 0x3ff5342b50000000 */
479 1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */
480 1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */
481 1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */
482 1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */
483 1.47682613134384155273e+00, /* 0x3ff7a11470000000 */
484 1.50916439294815063477e+00, /* 0x3ff8258990000000 */
485 1.54221081733703613281e+00, /* 0x3ff8ace540000000 */
486 1.57598084211349487305e+00, /* 0x3ff93737b0000000 */
487 1.61049032211303710938e+00, /* 0x3ff9c49180000000 */
488 1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */
489 1.68179279565811157227e+00, /* 0x3ffae89f90000000 */
490 1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */
491 1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */
492 1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */
493 1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */
494 1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */
495 1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */
496 1.95714408159255981445e+00}; /* 0x3fff507650000000 */
497
498 static const double two_to_jby32_trail_table[32] = {
499 0.00000000000000000000e+00, /* 0x0000000000000000 */
500 1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */
501 4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */
502 2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */
503 4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */
504 5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */
505 5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */
506 1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */
507 3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */
508 2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */
509 4.49683815095311756138e-08, /* 0x3e682468446b6824 */
510 1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */
511 1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */
512 2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */
513 4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */
514 8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */
515 2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */
516 3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */
517 1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */
518 3.46452721050003920866e-08, /* 0x3e62999c25159f11 */
519 8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */
520 2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */
521 9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */
522 8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */
523 3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */
524 1.11084703472699692902e-08, /* 0x3e47daf237553d84 */
525 5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */
526 4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */
527 4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */
528 1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */
529 9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */
530 4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */
531
532 /*
533 Step 1. Reduce the argument.
534
535 To perform argument reduction, we find the integer n such that
536 x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
537 n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
538 remainder by x - n*logbaseof2/32. The calculation of n is
539 straightforward whereas the computation of x - n*logbaseof2/32
540 must be carried out carefully.
541 logbaseof2/32 is so represented in two pieces that
542 (1) logbaseof2/32 is known to extra precision, (2) the product
543 of n and the leading piece is a model number and is hence
544 calculated without error, and (3) the subtraction of the value
545 obtained in (2) from x is a model number and is hence again
546 obtained without error.
547 */
548
549 r = x * thirtytwo_by_logbaseof2;
550 /* Set n = nearest integer to r */
551 /* This is faster on Hammer */
552 if (r > 0)
553 n = (int)(r + 0.5);
554 else
555 n = (int)(r - 0.5);
556
557 r1 = x - n * logbaseof2_by_32_lead;
558 r2 = - n * logbaseof2_by_32_trail;
559
560 /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */
561 /* j = n % 32;
562 if (j < 0) j += 32; */
563 j = n & 0x0000001f;
564
565 f1 = two_to_jby32_lead_table[j];
566 f2 = two_to_jby32_trail_table[j];
567
568 *m = (n - j) / 32;
569
570 /* Step 2. The following is the core approximation. We approximate
571 exp(r1+r2)-1 by a polynomial. */
572
573 r1 *= logbase; r2 *= logbase;
574
575 r = r1 + r2;
576 q = r1 + (r2 +
577 r*r*( 5.00000000000000008883e-01 +
578 r*( 1.66666666665260878863e-01 +
579 r*( 4.16666666662260795726e-02 +
580 r*( 8.33336798434219616221e-03 +
581 r*( 1.38889490863777199667e-03 ))))));
582
583 /* Step 3. Function value reconstruction.
584 We now reconstruct the exponential of the input argument
585 so that exp(x) = 2**m * (z1 + z2).
586 The order of the computation below must be strictly observed. */
587
588 *z1 = f1;
589 *z2 = f2 + ((f1 + f2) * q);
590 }
591 #endif /* USE_SPLITEXP */
592
593
594 #if defined(USE_SPLITEXPF)
595 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
596 Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
597 abs(x) > large/(ln(base)) (where large is the largest representable
598 floating point number) should be handled separately instead of calling
599 this function. This function is called by exp, exp2, exp10,
600 cosh and sinh. */
splitexpf(float x,float logbase,float thirtytwo_by_logbaseof2,float logbaseof2_by_32_lead,float logbaseof2_by_32_trail,int * m,float * z1,float * z2)601 static inline void splitexpf(float x, float logbase,
602 float thirtytwo_by_logbaseof2,
603 float logbaseof2_by_32_lead,
604 float logbaseof2_by_32_trail,
605 int *m, float *z1, float *z2)
606 {
607 float q, r, r1, r2, f1, f2;
608 int n, j;
609
610 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
611 leading and trailing parts respectively of precomputed
612 values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
613 two_to_jby32_lead_table contains the first 10 bits of precision,
614 and two_to_jby32_trail_table contains a further 24 bits precision. */
615
616 static const float two_to_jby32_lead_table[32] = {
617 1.0000000000E+00F, /* 0x3F800000 */
618 1.0214843750E+00F, /* 0x3F82C000 */
619 1.0429687500E+00F, /* 0x3F858000 */
620 1.0664062500E+00F, /* 0x3F888000 */
621 1.0898437500E+00F, /* 0x3F8B8000 */
622 1.1132812500E+00F, /* 0x3F8E8000 */
623 1.1386718750E+00F, /* 0x3F91C000 */
624 1.1621093750E+00F, /* 0x3F94C000 */
625 1.1875000000E+00F, /* 0x3F980000 */
626 1.2148437500E+00F, /* 0x3F9B8000 */
627 1.2402343750E+00F, /* 0x3F9EC000 */
628 1.2675781250E+00F, /* 0x3FA24000 */
629 1.2949218750E+00F, /* 0x3FA5C000 */
630 1.3242187500E+00F, /* 0x3FA98000 */
631 1.3535156250E+00F, /* 0x3FAD4000 */
632 1.3828125000E+00F, /* 0x3FB10000 */
633 1.4140625000E+00F, /* 0x3FB50000 */
634 1.4433593750E+00F, /* 0x3FB8C000 */
635 1.4765625000E+00F, /* 0x3FBD0000 */
636 1.5078125000E+00F, /* 0x3FC10000 */
637 1.5410156250E+00F, /* 0x3FC54000 */
638 1.5742187500E+00F, /* 0x3FC98000 */
639 1.6093750000E+00F, /* 0x3FCE0000 */
640 1.6445312500E+00F, /* 0x3FD28000 */
641 1.6816406250E+00F, /* 0x3FD74000 */
642 1.7167968750E+00F, /* 0x3FDBC000 */
643 1.7558593750E+00F, /* 0x3FE0C000 */
644 1.7929687500E+00F, /* 0x3FE58000 */
645 1.8339843750E+00F, /* 0x3FEAC000 */
646 1.8730468750E+00F, /* 0x3FEFC000 */
647 1.9140625000E+00F, /* 0x3FF50000 */
648 1.9570312500E+00F}; /* 0x3FFA8000 */
649
650 static const float two_to_jby32_trail_table[32] = {
651 0.0000000000E+00F, /* 0x00000000 */
652 4.1277357377E-04F, /* 0x39D86988 */
653 1.3050324051E-03F, /* 0x3AAB0D9F */
654 7.3415064253E-04F, /* 0x3A407404 */
655 6.6398258787E-04F, /* 0x3A2E0F1E */
656 1.1054925853E-03F, /* 0x3A90E62D */
657 1.1675967835E-04F, /* 0x38F4DCE0 */
658 1.6154836630E-03F, /* 0x3AD3BEA3 */
659 1.7071149778E-03F, /* 0x3ADFC146 */
660 4.0360994171E-04F, /* 0x39D39B9C */
661 1.6234370414E-03F, /* 0x3AD4C982 */
662 1.4728321694E-03F, /* 0x3AC10C0C */
663 1.9176795613E-03F, /* 0x3AFB5AA6 */
664 1.0178930825E-03F, /* 0x3A856AD3 */
665 7.3992193211E-04F, /* 0x3A41F752 */
666 1.0973819299E-03F, /* 0x3A8FD607 */
667 1.5106226783E-04F, /* 0x391E6678 */
668 1.8214319134E-03F, /* 0x3AEEBD1D */
669 2.6364589576E-04F, /* 0x398A39F4 */
670 1.3519275235E-03F, /* 0x3AB13329 */
671 1.1952003697E-03F, /* 0x3A9CA845 */
672 1.7620950239E-03F, /* 0x3AE6F619 */
673 1.1153318919E-03F, /* 0x3A923054 */
674 1.2242280645E-03F, /* 0x3AA07647 */
675 1.5220546629E-04F, /* 0x391F9958 */
676 1.8224230735E-03F, /* 0x3AEEDE5F */
677 3.9278529584E-04F, /* 0x39CDEEC0 */
678 1.7403248930E-03F, /* 0x3AE41B9D */
679 2.3711356334E-05F, /* 0x37C6E7C0 */
680 1.1207590578E-03F, /* 0x3A92E66F */
681 1.1440613307E-03F, /* 0x3A95F454 */
682 1.1287408415E-04F}; /* 0x38ECB6D0 */
683
684 /*
685 Step 1. Reduce the argument.
686
687 To perform argument reduction, we find the integer n such that
688 x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
689 n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
690 remainder by x - n*logbaseof2/32. The calculation of n is
691 straightforward whereas the computation of x - n*logbaseof2/32
692 must be carried out carefully.
693 logbaseof2/32 is so represented in two pieces that
694 (1) logbaseof2/32 is known to extra precision, (2) the product
695 of n and the leading piece is a model number and is hence
696 calculated without error, and (3) the subtraction of the value
697 obtained in (2) from x is a model number and is hence again
698 obtained without error.
699 */
700
701 r = x * thirtytwo_by_logbaseof2;
702 /* Set n = nearest integer to r */
703 /* This is faster on Hammer */
704 if (r > 0)
705 n = (int)(r + 0.5F);
706 else
707 n = (int)(r - 0.5F);
708
709 r1 = x - n * logbaseof2_by_32_lead;
710 r2 = - n * logbaseof2_by_32_trail;
711
712 /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */
713 /* j = n % 32;
714 if (j < 0) j += 32; */
715 j = n & 0x0000001f;
716
717 f1 = two_to_jby32_lead_table[j];
718 f2 = two_to_jby32_trail_table[j];
719
720 *m = (n - j) / 32;
721
722 /* Step 2. The following is the core approximation. We approximate
723 exp(r1+r2)-1 by a polynomial. */
724
725 r1 *= logbase; r2 *= logbase;
726
727 r = r1 + r2;
728 q = r1 + (r2 +
729 r*r*( 5.00000000000000008883e-01F +
730 r*( 1.66666666665260878863e-01F )));
731
732 /* Step 3. Function value reconstruction.
733 We now reconstruct the exponential of the input argument
734 so that exp(x) = 2**m * (z1 + z2).
735 The order of the computation below must be strictly observed. */
736
737 *z1 = f1;
738 *z2 = f2 + ((f1 + f2) * q);
739 }
740 #endif /* SPLITEXPF */
741
742
743 #if defined(USE_SCALEUPDOUBLE1024)
744 /* Scales up a double (normal or denormal) whose bit pattern is given
745 as ux by 2**1024. There are no checks that the input number is
746 scalable by that amount. */
scaleUpDouble1024(unsigned long long ux,unsigned long long * ur)747 static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
748 {
749 unsigned long long uy;
750 double y;
751
752 if ((ux & EXPBITS_DP64) == 0)
753 {
754 /* ux is denormalised */
755 PUT_BITS_DP64(ux | 0x4010000000000000, y);
756 if (ux & SIGNBIT_DP64)
757 y += 4.0;
758 else
759 y -= 4.0;
760 GET_BITS_DP64(y, uy);
761 }
762 else
763 /* ux is normal */
764 uy = ux + 0x4000000000000000;
765
766 *ur = uy;
767 return;
768 }
769
770 #endif /* SCALEUPDOUBLE1024 */
771
772
773 #if defined(USE_SCALEDOWNDOUBLE)
774 /* Scales down a double whose bit pattern is given as ux by 2**k.
775 There are no checks that the input number is scalable by that amount. */
scaleDownDouble(unsigned long long ux,int k,unsigned long long * ur)776 static inline void scaleDownDouble(unsigned long long ux, int k,
777 unsigned long long *ur)
778 {
779 unsigned long long uy, uk, ax, xsign;
780 int n, shift;
781 xsign = ux & SIGNBIT_DP64;
782 ax = ux & ~SIGNBIT_DP64;
783 n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
784 if (n > 0)
785 {
786 uk = (unsigned long long)n << EXPSHIFTBITS_DP64;
787 uy = (ax & ~EXPBITS_DP64) | uk;
788 }
789 else
790 {
791 uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000;
792 shift = (1 - n);
793 if (shift > MANTLENGTH_DP64 + 1)
794 /* Sigh. Shifting works mod 64 so be careful not to shift too much */
795 uy = 0;
796 else
797 {
798 /* Make sure we round the result */
799 uy >>= shift - 1;
800 uy = (uy >> 1) + (uy & 1);
801 }
802 }
803 *ur = uy | xsign;
804 }
805
806 #endif /* SCALEDOWNDOUBLE */
807
808
809 #if defined(USE_SCALEUPFLOAT128)
810 /* Scales up a float (normal or denormal) whose bit pattern is given
811 as ux by 2**128. There are no checks that the input number is
812 scalable by that amount. */
scaleUpFloat128(unsigned int ux,unsigned int * ur)813 static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur)
814 {
815 unsigned int uy;
816 float y;
817
818 if ((ux & EXPBITS_SP32) == 0)
819 {
820 /* ux is denormalised */
821 PUT_BITS_SP32(ux | 0x40800000, y);
822 /* Compensate for the implicit bit just added */
823 if (ux & SIGNBIT_SP32)
824 y += 4.0F;
825 else
826 y -= 4.0F;
827 GET_BITS_SP32(y, uy);
828 }
829 else
830 /* ux is normal */
831 uy = ux + 0x40000000;
832 *ur = uy;
833 }
834 #endif /* SCALEUPFLOAT128 */
835
836
837 #if defined(USE_SCALEDOWNFLOAT)
838 /* Scales down a float whose bit pattern is given as ux by 2**k.
839 There are no checks that the input number is scalable by that amount. */
scaleDownFloat(unsigned int ux,int k,unsigned int * ur)840 static inline void scaleDownFloat(unsigned int ux, int k,
841 unsigned int *ur)
842 {
843 unsigned int uy, uk, ax, xsign;
844 int n, shift;
845
846 xsign = ux & SIGNBIT_SP32;
847 ax = ux & ~SIGNBIT_SP32;
848 n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k;
849 if (n > 0)
850 {
851 uk = (unsigned int)n << EXPSHIFTBITS_SP32;
852 uy = (ax & ~EXPBITS_SP32) | uk;
853 }
854 else
855 {
856 uy = (ax & ~EXPBITS_SP32) | 0x00800000;
857 shift = (1 - n);
858 if (shift > MANTLENGTH_SP32 + 1)
859 /* Sigh. Shifting works mod 32 so be careful not to shift too much */
860 uy = 0;
861 else
862 {
863 /* Make sure we round the result */
864 uy >>= shift - 1;
865 uy = (uy >> 1) + (uy & 1);
866 }
867 }
868 *ur = uy | xsign;
869 }
870 #endif /* SCALEDOWNFLOAT */
871
872
873 #if defined(USE_SQRT_AMD_INLINE)
sqrt_amd_inline(double x)874 static inline double sqrt_amd_inline(double x)
875 {
876 /*
877 Computes the square root of x.
878
879 The calculation is carried out in three steps.
880
881 Step 1. Reduction.
882 The input argument is scaled to the interval [1, 4) by
883 computing
884 x = 2^e * y, where y in [1,4).
885 Furthermore y is decomposed as y = c + t where
886 c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
887
888 Step 2. Approximation.
889 An approximation q = sqrt(1 + (t/c)) - 1 is obtained
890 from a basic series expansion using precomputed values
891 stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl.
892
893 Step 3. Reconstruction.
894 The value of sqrt(x) is reconstructed via
895 sqrt(x) = 2^(e/2) * sqrt(y)
896 = 2^(e/2) * sqrt(c) * sqrt(y/c)
897 = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
898 = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
899 */
900
901 unsigned long long ux, ax, u;
902 double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
903 int e, denorm = 0, index;
904
905 /* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain
906 leading and trailing parts respectively of precomputed
907 values of sqrt(j/32), for j = 32, 33, ..., 128.
908 rt_jby32_lead_table_dbl contains the first 21 bits of precision,
909 and rt_jby32_trail_table_dbl contains a further 53 bits precision. */
910
911 static const double rt_jby32_lead_table_dbl[97] = {
912 1.00000000000000000000e+00, /* 0x3ff0000000000000 */
913 1.01550388336181640625e+00, /* 0x3ff03f8100000000 */
914 1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */
915 1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */
916 1.06065940856933593750e+00, /* 0x3ff0f87600000000 */
917 1.07528972625732421875e+00, /* 0x3ff1346300000000 */
918 1.08972454071044921875e+00, /* 0x3ff16f8300000000 */
919 1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */
920 1.11803340911865234375e+00, /* 0x3ff1e37700000000 */
921 1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */
922 1.14564323425292968750e+00, /* 0x3ff2548e00000000 */
923 1.15920162200927734375e+00, /* 0x3ff28c1700000000 */
924 1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */
925 1.18585395812988281250e+00, /* 0x3ff2f94200000000 */
926 1.19895744323730468750e+00, /* 0x3ff32eee00000000 */
927 1.21191978454589843750e+00, /* 0x3ff3640600000000 */
928 1.22474479675292968750e+00, /* 0x3ff3988e00000000 */
929 1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */
930 1.25000000000000000000e+00, /* 0x3ff4000000000000 */
931 1.26243782043457031250e+00, /* 0x3ff432f200000000 */
932 1.27475452423095703125e+00, /* 0x3ff4656500000000 */
933 1.28695297241210937500e+00, /* 0x3ff4975c00000000 */
934 1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */
935 1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */
936 1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */
937 1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */
938 1.34629058837890625000e+00, /* 0x3ff58a6800000000 */
939 1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */
940 1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */
941 1.38066959381103515625e+00, /* 0x3ff6173900000000 */
942 1.39194107055664062500e+00, /* 0x3ff6456400000000 */
943 1.40312099456787109375e+00, /* 0x3ff6732f00000000 */
944 1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */
945 1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */
946 1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */
947 1.44697952270507812500e+00, /* 0x3ff726d400000000 */
948 1.45773792266845703125e+00, /* 0x3ff752e500000000 */
949 1.46841716766357421875e+00, /* 0x3ff77ea300000000 */
950 1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */
951 1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */
952 1.50000000000000000000e+00, /* 0x3ff8000000000000 */
953 1.51038074493408203125e+00, /* 0x3ff82a8500000000 */
954 1.52068996429443359375e+00, /* 0x3ff854bf00000000 */
955 1.53093051910400390625e+00, /* 0x3ff87eb100000000 */
956 1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */
957 1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */
958 1.56124877929687500000e+00, /* 0x3ff8fae000000000 */
959 1.57122516632080078125e+00, /* 0x3ff923bd00000000 */
960 1.58113861083984375000e+00, /* 0x3ff94c5800000000 */
961 1.59099006652832031250e+00, /* 0x3ff974b200000000 */
962 1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */
963 1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */
964 1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */
965 1.62979984283447265625e+00, /* 0x3ffa13a900000000 */
966 1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */
967 1.64886283874511718750e+00, /* 0x3ffa61be00000000 */
968 1.65831184387207031250e+00, /* 0x3ffa887200000000 */
969 1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */
970 1.67705059051513671875e+00, /* 0x3ffad53300000000 */
971 1.68634128570556640625e+00, /* 0x3ffafb4100000000 */
972 1.69558238983154296875e+00, /* 0x3ffb211b00000000 */
973 1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */
974 1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */
975 1.72300529479980468750e+00, /* 0x3ffb916e00000000 */
976 1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */
977 1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */
978 1.75000000000000000000e+00, /* 0x3ffc000000000000 */
979 1.75890541076660156250e+00, /* 0x3ffc247a00000000 */
980 1.76776695251464843750e+00, /* 0x3ffc48c600000000 */
981 1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */
982 1.78535652160644531250e+00, /* 0x3ffc90d200000000 */
983 1.79408740997314453125e+00, /* 0x3ffcb49500000000 */
984 1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */
985 1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */
986 1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */
987 1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */
988 1.83711719512939453125e+00, /* 0x3ffd64d500000000 */
989 1.84560203552246093750e+00, /* 0x3ffd879600000000 */
990 1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */
991 1.86245727539062500000e+00, /* 0x3ffdcca000000000 */
992 1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */
993 1.87916183471679687500e+00, /* 0x3ffe110c00000000 */
994 1.88745784759521484375e+00, /* 0x3ffe330700000000 */
995 1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */
996 1.90394306182861328125e+00, /* 0x3ffe768d00000000 */
997 1.91213226318359375000e+00, /* 0x3ffe981800000000 */
998 1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */
999 1.92840576171875000000e+00, /* 0x3ffedac000000000 */
1000 1.93649101257324218750e+00, /* 0x3ffefbde00000000 */
1001 1.94454288482666015625e+00, /* 0x3fff1cd900000000 */
1002 1.95256233215332031250e+00, /* 0x3fff3db200000000 */
1003 1.96054744720458984375e+00, /* 0x3fff5e6700000000 */
1004 1.96850109100341796875e+00, /* 0x3fff7efb00000000 */
1005 1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */
1006 1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */
1007 1.99217128753662109375e+00, /* 0x3fffdfef00000000 */
1008 2.00000000000000000000e+00}; /* 0x4000000000000000 */
1009
1010 static const double rt_jby32_trail_table_dbl[97] = {
1011 0.00000000000000000000e+00, /* 0x0000000000000000 */
1012 9.17217678638807524014e-07, /* 0x3eaec6d70177881c */
1013 3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */
1014 2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */
1015 7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */
1016 9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */
1017 1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */
1018 5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */
1019 5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */
1020 4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */
1021 6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */
1022 6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */
1023 3.32778123013641425828e-07, /* 0x3e965515a9b34850 */
1024 1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */
1025 4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */
1026 1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */
1027 7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */
1028 5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */
1029 0.00000000000000000000e+00, /* 0x0000000000000000 */
1030 2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */
1031 3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */
1032 7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */
1033 1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */
1034 6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */
1035 6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */
1036 6.20124838851440463844e-10, /* 0x3e054eacff2057dc */
1037 6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */
1038 3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */
1039 7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */
1040 5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */
1041 2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */
1042 5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */
1043 3.81831102861301692797e-07, /* 0x3e999fcef32422cc */
1044 6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */
1045 6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */
1046 9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */
1047 5.10428680864685379950e-08, /* 0x3e6b674743636676 */
1048 3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */
1049 7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */
1050 1.35158752025506517690e-07, /* 0x3e822404f4a103ee */
1051 0.00000000000000000000e+00, /* 0x0000000000000000 */
1052 1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */
1053 6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */
1054 5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */
1055 1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */
1056 7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */
1057 7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */
1058 4.69926266058212796694e-07, /* 0x3e9f8945932d872e */
1059 2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */
1060 1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */
1061 5.72297665296622053774e-07, /* 0x3ea333ffe005988d */
1062 5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */
1063 2.76225500213991506100e-07, /* 0x3e92898498f55f9e */
1064 7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */
1065 1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */
1066 4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */
1067 5.51305629612057435809e-07, /* 0x3ea27fade682d1de */
1068 5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */
1069 3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */
1070 9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */
1071 1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */
1072 7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */
1073 6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */
1074 7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */
1075 8.65552275731027456121e-07, /* 0x3ead0b09954e764b */
1076 6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */
1077 0.00000000000000000000e+00, /* 0x0000000000000000 */
1078 4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */
1079 4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */
1080 1.28874162718371367439e-07, /* 0x3e814c151f991b2e */
1081 5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */
1082 1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */
1083 2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */
1084 6.98925535290464831294e-07, /* 0x3ea773b981896751 */
1085 1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */
1086 5.43759351196479689657e-08, /* 0x3e6d315f22491900 */
1087 1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */
1088 8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */
1089 8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */
1090 7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */
1091 6.48469316302918797451e-08, /* 0x3e71683f4920555d */
1092 2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */
1093 7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */
1094 2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */
1095 2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */
1096 5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */
1097 2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */
1098 3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */
1099 6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */
1100 7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */
1101 8.68233432860324345268e-08, /* 0x3e774e746878544d */
1102 9.45465175398023087082e-07, /* 0x3eafb97be873a87d */
1103 8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */
1104 2.74055432394999316135e-07, /* 0x3e92643c89cda173 */
1105 4.72129009349126213532e-07, /* 0x3e9faf1d57a4d56c */
1106 8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */
1107 0.00000000000000000000e+00}; /* 0x0000000000000000 */
1108
1109
1110 /* Handle special arguments first */
1111
1112 GET_BITS_DP64(x, ux);
1113 ax = ux & (~SIGNBIT_DP64);
1114
1115 if(ax >= 0x7ff0000000000000)
1116 {
1117 /* x is either NaN or infinity */
1118 if (ux & MANTBITS_DP64)
1119 /* x is NaN */
1120 return x + x; /* Raise invalid if it is a signalling NaN */
1121 else if (ux & SIGNBIT_DP64)
1122 /* x is negative infinity */
1123 return nan_with_flags(AMD_F_INVALID);
1124 else
1125 /* x is positive infinity */
1126 return x;
1127 }
1128 else if (ux & SIGNBIT_DP64)
1129 {
1130 /* x is negative. */
1131 if (ux == SIGNBIT_DP64)
1132 /* Handle negative zero first */
1133 return x;
1134 else
1135 return nan_with_flags(AMD_F_INVALID);
1136 }
1137 else if (ux <= 0x000fffffffffffff)
1138 {
1139 /* x is denormalised or zero */
1140 if (ux == 0)
1141 /* x is zero */
1142 return x;
1143 else
1144 {
1145 /* x is denormalised; scale it up */
1146 /* Normalize x by increasing the exponent by 60
1147 and subtracting a correction to account for the implicit
1148 bit. This replaces a slow denormalized
1149 multiplication by a fast normal subtraction. */
1150 static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
1151 denorm = 1;
1152 GET_BITS_DP64(x, ux);
1153 PUT_BITS_DP64(ux | 0x03d0000000000000, x);
1154 x -= corr;
1155 GET_BITS_DP64(x, ux);
1156 }
1157 }
1158
1159 /* Main algorithm */
1160
1161 /*
1162 Find y and e such that x = 2^e * y, where y in [1,4).
1163 This is done using an in-lined variant of splitDouble,
1164 which also ensures that e is even.
1165 */
1166 y = x;
1167 ux &= EXPBITS_DP64;
1168 ux >>= EXPSHIFTBITS_DP64;
1169 if (ux & 1)
1170 {
1171 GET_BITS_DP64(y, u);
1172 u &= (SIGNBIT_DP64 | MANTBITS_DP64);
1173 u |= ONEEXPBITS_DP64;
1174 PUT_BITS_DP64(u, y);
1175 e = ux - EXPBIAS_DP64;
1176 }
1177 else
1178 {
1179 GET_BITS_DP64(y, u);
1180 u &= (SIGNBIT_DP64 | MANTBITS_DP64);
1181 u |= TWOEXPBITS_DP64;
1182 PUT_BITS_DP64(u, y);
1183 e = ux - EXPBIAS_DP64 - 1;
1184 }
1185
1186
1187 /* Find the index of the sub-interval of [1,4) in which y lies. */
1188
1189 index = (int)(32.0*y+0.5);
1190
1191 /* Look up the table values and compute c and r = c/t */
1192
1193 rtc_lead = rt_jby32_lead_table_dbl[index-32];
1194 rtc_trail = rt_jby32_trail_table_dbl[index-32];
1195 c = 0.03125*index;
1196 r = (y - c)/c;
1197
1198 /*
1199 Find q = sqrt(1+r) - 1.
1200 From one step of Newton on (q+1)^2 = 1+r
1201 */
1202
1203 p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01));
1204 twop = p + p;
1205 q = p - (p*p + (twop - r))/(twop + 2.0);
1206
1207 /* Reconstruction */
1208
1209 rtc = rtc_lead + rtc_trail;
1210 e >>= 1; /* e = e/2 */
1211 z = rtc_lead + (rtc*q+rtc_trail);
1212
1213 if (denorm)
1214 {
1215 /* Scale by 2**(e-30) */
1216 PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
1217 z *= r;
1218 }
1219 else
1220 {
1221 /* Scale by 2**e */
1222 PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
1223 z *= r;
1224 }
1225
1226 return z;
1227
1228 }
1229 #endif /* SQRT_AMD_INLINE */
1230
1231 #if defined(USE_SQRTF_AMD_INLINE)
1232
sqrtf_amd_inline(float x)1233 static inline float sqrtf_amd_inline(float x)
1234 {
1235 /*
1236 Computes the square root of x.
1237
1238 The calculation is carried out in three steps.
1239
1240 Step 1. Reduction.
1241 The input argument is scaled to the interval [1, 4) by
1242 computing
1243 x = 2^e * y, where y in [1,4).
1244 Furthermore y is decomposed as y = c + t where
1245 c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
1246
1247 Step 2. Approximation.
1248 An approximation q = sqrt(1 + (t/c)) - 1 is obtained
1249 from a basic series expansion using precomputed values
1250 stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float.
1251
1252 Step 3. Reconstruction.
1253 The value of sqrt(x) is reconstructed via
1254 sqrt(x) = 2^(e/2) * sqrt(y)
1255 = 2^(e/2) * sqrt(c) * sqrt(y/c)
1256 = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
1257 = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
1258 */
1259
1260 unsigned int ux, ax, u;
1261 float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
1262 int e, denorm = 0, index;
1263
1264 /* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain
1265 leading and trailing parts respectively of precomputed
1266 values of sqrt(j/32), for j = 32, 33, ..., 128.
1267 rt_jby32_lead_table_float contains the first 13 bits of precision,
1268 and rt_jby32_trail_table_float contains a further 24 bits precision. */
1269
1270 static const float rt_jby32_lead_table_float[97] = {
1271 1.00000000000000000000e+00F, /* 0x3f800000 */
1272 1.01538085937500000000e+00F, /* 0x3f81f800 */
1273 1.03076171875000000000e+00F, /* 0x3f83f000 */
1274 1.04565429687500000000e+00F, /* 0x3f85d800 */
1275 1.06054687500000000000e+00F, /* 0x3f87c000 */
1276 1.07519531250000000000e+00F, /* 0x3f89a000 */
1277 1.08959960937500000000e+00F, /* 0x3f8b7800 */
1278 1.10375976562500000000e+00F, /* 0x3f8d4800 */
1279 1.11791992187500000000e+00F, /* 0x3f8f1800 */
1280 1.13183593750000000000e+00F, /* 0x3f90e000 */
1281 1.14550781250000000000e+00F, /* 0x3f92a000 */
1282 1.15917968750000000000e+00F, /* 0x3f946000 */
1283 1.17236328125000000000e+00F, /* 0x3f961000 */
1284 1.18579101562500000000e+00F, /* 0x3f97c800 */
1285 1.19873046875000000000e+00F, /* 0x3f997000 */
1286 1.21191406250000000000e+00F, /* 0x3f9b2000 */
1287 1.22460937500000000000e+00F, /* 0x3f9cc000 */
1288 1.23730468750000000000e+00F, /* 0x3f9e6000 */
1289 1.25000000000000000000e+00F, /* 0x3fa00000 */
1290 1.26220703125000000000e+00F, /* 0x3fa19000 */
1291 1.27465820312500000000e+00F, /* 0x3fa32800 */
1292 1.28686523437500000000e+00F, /* 0x3fa4b800 */
1293 1.29882812500000000000e+00F, /* 0x3fa64000 */
1294 1.31079101562500000000e+00F, /* 0x3fa7c800 */
1295 1.32275390625000000000e+00F, /* 0x3fa95000 */
1296 1.33447265625000000000e+00F, /* 0x3faad000 */
1297 1.34619140625000000000e+00F, /* 0x3fac5000 */
1298 1.35766601562500000000e+00F, /* 0x3fadc800 */
1299 1.36914062500000000000e+00F, /* 0x3faf4000 */
1300 1.38061523437500000000e+00F, /* 0x3fb0b800 */
1301 1.39184570312500000000e+00F, /* 0x3fb22800 */
1302 1.40307617187500000000e+00F, /* 0x3fb39800 */
1303 1.41406250000000000000e+00F, /* 0x3fb50000 */
1304 1.42504882812500000000e+00F, /* 0x3fb66800 */
1305 1.43603515625000000000e+00F, /* 0x3fb7d000 */
1306 1.44677734375000000000e+00F, /* 0x3fb93000 */
1307 1.45751953125000000000e+00F, /* 0x3fba9000 */
1308 1.46826171875000000000e+00F, /* 0x3fbbf000 */
1309 1.47900390625000000000e+00F, /* 0x3fbd5000 */
1310 1.48950195312500000000e+00F, /* 0x3fbea800 */
1311 1.50000000000000000000e+00F, /* 0x3fc00000 */
1312 1.51025390625000000000e+00F, /* 0x3fc15000 */
1313 1.52050781250000000000e+00F, /* 0x3fc2a000 */
1314 1.53076171875000000000e+00F, /* 0x3fc3f000 */
1315 1.54101562500000000000e+00F, /* 0x3fc54000 */
1316 1.55102539062500000000e+00F, /* 0x3fc68800 */
1317 1.56103515625000000000e+00F, /* 0x3fc7d000 */
1318 1.57104492187500000000e+00F, /* 0x3fc91800 */
1319 1.58105468750000000000e+00F, /* 0x3fca6000 */
1320 1.59082031250000000000e+00F, /* 0x3fcba000 */
1321 1.60058593750000000000e+00F, /* 0x3fcce000 */
1322 1.61035156250000000000e+00F, /* 0x3fce2000 */
1323 1.62011718750000000000e+00F, /* 0x3fcf6000 */
1324 1.62963867187500000000e+00F, /* 0x3fd09800 */
1325 1.63916015625000000000e+00F, /* 0x3fd1d000 */
1326 1.64868164062500000000e+00F, /* 0x3fd30800 */
1327 1.65820312500000000000e+00F, /* 0x3fd44000 */
1328 1.66748046875000000000e+00F, /* 0x3fd57000 */
1329 1.67700195312500000000e+00F, /* 0x3fd6a800 */
1330 1.68627929687500000000e+00F, /* 0x3fd7d800 */
1331 1.69555664062500000000e+00F, /* 0x3fd90800 */
1332 1.70458984375000000000e+00F, /* 0x3fda3000 */
1333 1.71386718750000000000e+00F, /* 0x3fdb6000 */
1334 1.72290039062500000000e+00F, /* 0x3fdc8800 */
1335 1.73193359375000000000e+00F, /* 0x3fddb000 */
1336 1.74096679687500000000e+00F, /* 0x3fded800 */
1337 1.75000000000000000000e+00F, /* 0x3fe00000 */
1338 1.75878906250000000000e+00F, /* 0x3fe12000 */
1339 1.76757812500000000000e+00F, /* 0x3fe24000 */
1340 1.77636718750000000000e+00F, /* 0x3fe36000 */
1341 1.78515625000000000000e+00F, /* 0x3fe48000 */
1342 1.79394531250000000000e+00F, /* 0x3fe5a000 */
1343 1.80273437500000000000e+00F, /* 0x3fe6c000 */
1344 1.81127929687500000000e+00F, /* 0x3fe7d800 */
1345 1.81982421875000000000e+00F, /* 0x3fe8f000 */
1346 1.82836914062500000000e+00F, /* 0x3fea0800 */
1347 1.83691406250000000000e+00F, /* 0x3feb2000 */
1348 1.84545898437500000000e+00F, /* 0x3fec3800 */
1349 1.85400390625000000000e+00F, /* 0x3fed5000 */
1350 1.86230468750000000000e+00F, /* 0x3fee6000 */
1351 1.87060546875000000000e+00F, /* 0x3fef7000 */
1352 1.87915039062500000000e+00F, /* 0x3ff08800 */
1353 1.88745117187500000000e+00F, /* 0x3ff19800 */
1354 1.89550781250000000000e+00F, /* 0x3ff2a000 */
1355 1.90380859375000000000e+00F, /* 0x3ff3b000 */
1356 1.91210937500000000000e+00F, /* 0x3ff4c000 */
1357 1.92016601562500000000e+00F, /* 0x3ff5c800 */
1358 1.92822265625000000000e+00F, /* 0x3ff6d000 */
1359 1.93627929687500000000e+00F, /* 0x3ff7d800 */
1360 1.94433593750000000000e+00F, /* 0x3ff8e000 */
1361 1.95239257812500000000e+00F, /* 0x3ff9e800 */
1362 1.96044921875000000000e+00F, /* 0x3ffaf000 */
1363 1.96826171875000000000e+00F, /* 0x3ffbf000 */
1364 1.97631835937500000000e+00F, /* 0x3ffcf800 */
1365 1.98413085937500000000e+00F, /* 0x3ffdf800 */
1366 1.99194335937500000000e+00F, /* 0x3ffef800 */
1367 2.00000000000000000000e+00F}; /* 0x40000000 */
1368
1369 static const float rt_jby32_trail_table_float[97] = {
1370 0.00000000000000000000e+00F, /* 0x00000000 */
1371 1.23941208585165441036e-04F, /* 0x3901f637 */
1372 1.46876545841223560274e-05F, /* 0x37766aff */
1373 1.70736297150142490864e-04F, /* 0x393307ad */
1374 1.13296780909877270460e-04F, /* 0x38ed99bf */
1375 9.53458802541717886925e-05F, /* 0x38c7f46e */
1376 1.25126505736261606216e-04F, /* 0x39033464 */
1377 2.10342666832730174065e-04F, /* 0x395c8f6e */
1378 1.14066875539720058441e-04F, /* 0x38ef3730 */
1379 8.72047676239162683487e-05F, /* 0x38b6e1b4 */
1380 1.36111237225122749805e-04F, /* 0x390eb915 */
1381 2.26244374061934649944e-05F, /* 0x37bdc99c */
1382 2.40658700931817293167e-04F, /* 0x397c5954 */
1383 6.31069415248930454254e-05F, /* 0x38845848 */
1384 2.27412077947519719601e-04F, /* 0x396e7577 */
1385 5.90185391047270968556e-06F, /* 0x36c6088a */
1386 1.35496389702893793583e-04F, /* 0x390e1409 */
1387 1.32179571664892137051e-04F, /* 0x390a99af */
1388 0.00000000000000000000e+00F, /* 0x00000000 */
1389 2.31086043640971183777e-04F, /* 0x39724fb0 */
1390 9.66752704698592424393e-05F, /* 0x38cabe24 */
1391 8.85332483449019491673e-05F, /* 0x38b9aaed */
1392 2.09980673389509320259e-04F, /* 0x395c2e42 */
1393 2.20044588786549866199e-04F, /* 0x3966bbc5 */
1394 1.21749282698146998882e-04F, /* 0x38ff53a6 */
1395 1.62125259521417319775e-04F, /* 0x392a002b */
1396 9.97955357888713479042e-05F, /* 0x38d14952 */
1397 1.81545779923908412457e-04F, /* 0x393e5d53 */
1398 1.65768768056295812130e-04F, /* 0x392dd237 */
1399 5.48927710042335093021e-05F, /* 0x38663caa */
1400 9.53875860432162880898e-05F, /* 0x38c80ad2 */
1401 4.53481625299900770187e-05F, /* 0x383e3438 */
1402 1.51062369695864617825e-04F, /* 0x391e667f */
1403 1.70453247847035527229e-04F, /* 0x3932bbb2 */
1404 1.05505387182347476482e-04F, /* 0x38dd42c6 */
1405 2.02269104192964732647e-04F, /* 0x39541833 */
1406 2.18442466575652360916e-04F, /* 0x39650db4 */
1407 1.55796806211583316326e-04F, /* 0x39235d63 */
1408 1.60395247803535312414e-05F, /* 0x37868c9e */
1409 4.49578510597348213196e-05F, /* 0x383c9120 */
1410 0.00000000000000000000e+00F, /* 0x00000000 */
1411 1.26840444863773882389e-04F, /* 0x39050079 */
1412 1.82820076588541269302e-04F, /* 0x393fb364 */
1413 1.69370483490638434887e-04F, /* 0x3931990b */
1414 8.78757418831810355186e-05F, /* 0x38b849ee */
1415 1.83815121999941766262e-04F, /* 0x3940be7f */
1416 2.14343352126888930798e-04F, /* 0x3960c15b */
1417 1.80714370799250900745e-04F, /* 0x393d7e25 */
1418 8.41425862745381891727e-05F, /* 0x38b075b5 */
1419 1.69945167726837098598e-04F, /* 0x3932334f */
1420 1.95121858268976211548e-04F, /* 0x394c99a0 */
1421 1.60778334247879683971e-04F, /* 0x3928969b */
1422 6.79871009197086095810e-05F, /* 0x388e944c */
1423 1.61929419846273958683e-04F, /* 0x3929cb99 */
1424 1.99474830878898501396e-04F, /* 0x39512a1e */
1425 1.81604162207804620266e-04F, /* 0x393e6cff */
1426 1.09270178654696792364e-04F, /* 0x38e527fb */
1427 2.27539261686615645885e-04F, /* 0x396e979b */
1428 4.90300008095800876617e-05F, /* 0x384da590 */
1429 6.28985289949923753738e-05F, /* 0x3883e864 */
1430 2.58551553997676819563e-05F, /* 0x37d8e386 */
1431 1.82868374395184218884e-04F, /* 0x393fc05b */
1432 4.64625991298817098141e-05F, /* 0x3842e0d6 */
1433 1.05703387816902250051e-04F, /* 0x38ddad13 */
1434 1.17213814519345760345e-04F, /* 0x38f5d0b0 */
1435 8.17377731436863541603e-05F, /* 0x38ab6aa2 */
1436 0.00000000000000000000e+00F, /* 0x00000000 */
1437 1.16847433673683553934e-04F, /* 0x38f50bfd */
1438 1.88827965757809579372e-04F, /* 0x3946001f */
1439 2.16612941585481166840e-04F, /* 0x39632298 */
1440 2.00857131858356297016e-04F, /* 0x39529d2d */
1441 1.42199307447299361229e-04F, /* 0x39151b56 */
1442 4.12627305195201188326e-05F, /* 0x382d1185 */
1443 1.42796401632949709892e-04F, /* 0x3915bb9e */
1444 2.03253570361994206905e-04F, /* 0x39552077 */
1445 2.23214170546270906925e-04F, /* 0x396a0e99 */
1446 2.03244591830298304558e-04F, /* 0x39551e0e */
1447 1.43898156238719820976e-04F, /* 0x3916e35e */
1448 4.57155256299301981926e-05F, /* 0x383fbeac */
1449 1.53365719597786664963e-04F, /* 0x3920d0cc */
1450 2.23224633373320102692e-04F, /* 0x396a1168 */
1451 1.16566716314991936088e-05F, /* 0x37439106 */
1452 7.43694272387074306607e-06F, /* 0x36f98ada */
1453 2.11048507480882108212e-04F, /* 0x395d4ce7 */
1454 1.34682719362899661064e-04F, /* 0x390d399e */
1455 2.29425968427676707506e-05F, /* 0x37c074da */
1456 1.20421340398024767637e-04F, /* 0x38fc8ab7 */
1457 1.83421318070031702518e-04F, /* 0x394054c9 */
1458 2.12376224226318299770e-04F, /* 0x395eb14f */
1459 2.07710763788782060146e-04F, /* 0x3959ccef */
1460 1.69840845046564936638e-04F, /* 0x3932174e */
1461 9.91739216260612010956e-05F, /* 0x38cffb98 */
1462 2.40249748458154499531e-04F, /* 0x397beb8d */
1463 1.05178231024183332920e-04F, /* 0x38dc9322 */
1464 1.82623916771262884140e-04F, /* 0x393f7ebc */
1465 2.28821940254420042038e-04F, /* 0x396fefec */
1466 0.00000000000000000000e+00F}; /* 0x00000000 */
1467
1468
1469 /* Handle special arguments first */
1470
1471 GET_BITS_SP32(x, ux);
1472 ax = ux & (~SIGNBIT_SP32);
1473
1474 if(ax >= 0x7f800000)
1475 {
1476 /* x is either NaN or infinity */
1477 if (ux & MANTBITS_SP32)
1478 /* x is NaN */
1479 return x + x; /* Raise invalid if it is a signalling NaN */
1480 else if (ux & SIGNBIT_SP32)
1481 return nanf_with_flags(AMD_F_INVALID);
1482 else
1483 /* x is positive infinity */
1484 return x;
1485 }
1486 else if (ux & SIGNBIT_SP32)
1487 {
1488 /* x is negative. */
1489 if (x == 0.0F)
1490 /* Handle negative zero first */
1491 return x;
1492 else
1493 return nanf_with_flags(AMD_F_INVALID);
1494 }
1495 else if (ux <= 0x007fffff)
1496 {
1497 /* x is denormalised or zero */
1498 if (ux == 0)
1499 /* x is zero */
1500 return x;
1501 else
1502 {
1503 /* x is denormalised; scale it up */
1504 /* Normalize x by increasing the exponent by 26
1505 and subtracting a correction to account for the implicit
1506 bit. This replaces a slow denormalized
1507 multiplication by a fast normal subtraction. */
1508 static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */
1509 denorm = 1;
1510 GET_BITS_SP32(x, ux);
1511 PUT_BITS_SP32(ux | 0x0d800000, x);
1512 x -= corr;
1513 GET_BITS_SP32(x, ux);
1514 }
1515 }
1516
1517 /* Main algorithm */
1518
1519 /*
1520 Find y and e such that x = 2^e * y, where y in [1,4).
1521 This is done using an in-lined variant of splitFloat,
1522 which also ensures that e is even.
1523 */
1524 y = x;
1525 ux &= EXPBITS_SP32;
1526 ux >>= EXPSHIFTBITS_SP32;
1527 if (ux & 1)
1528 {
1529 GET_BITS_SP32(y, u);
1530 u &= (SIGNBIT_SP32 | MANTBITS_SP32);
1531 u |= ONEEXPBITS_SP32;
1532 PUT_BITS_SP32(u, y);
1533 e = ux - EXPBIAS_SP32;
1534 }
1535 else
1536 {
1537 GET_BITS_SP32(y, u);
1538 u &= (SIGNBIT_SP32 | MANTBITS_SP32);
1539 u |= TWOEXPBITS_SP32;
1540 PUT_BITS_SP32(u, y);
1541 e = ux - EXPBIAS_SP32 - 1;
1542 }
1543
1544 /* Find the index of the sub-interval of [1,4) in which y lies. */
1545
1546 index = (int)(32.0F*y+0.5);
1547
1548 /* Look up the table values and compute c and r = c/t */
1549
1550 rtc_lead = rt_jby32_lead_table_float[index-32];
1551 rtc_trail = rt_jby32_trail_table_float[index-32];
1552 c = 0.03125F*index;
1553 r = (y - c)/c;
1554
1555 /*
1556 Find q = sqrt(1+r) - 1.
1557 From one step of Newton on (q+1)^2 = 1+r
1558 */
1559
1560 p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F));
1561 twop = p + p;
1562 q = p - (p*p + (twop - r))/(twop + 2.0);
1563
1564 /* Reconstruction */
1565
1566 rtc = rtc_lead + rtc_trail;
1567 e >>= 1; /* e = e/2 */
1568 z = rtc_lead + (rtc*q+rtc_trail);
1569
1570 if (denorm)
1571 {
1572 /* Scale by 2**(e-13) */
1573 PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
1574 z *= r;
1575 }
1576 else
1577 {
1578 /* Scale by 2**e */
1579 PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
1580 z *= r;
1581 }
1582
1583 return z;
1584
1585 }
1586 #endif /* SQRTF_AMD_INLINE */
1587
1588 #ifdef USE_LOG_KERNEL_AMD
log_kernel_amd64(double x,unsigned long long ux,int * xexp,double * r1,double * r2)1589 static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
1590 {
1591
1592 int expadjust;
1593 double r, z1, z2, correction, f, f1, f2, q, u, v, poly;
1594 int index;
1595
1596 /*
1597 Computes natural log(x). Algorithm based on:
1598 Ping-Tak Peter Tang
1599 "Table-driven implementation of the logarithm function in IEEE
1600 floating-point arithmetic"
1601 ACM Transactions on Mathematical Software (TOMS)
1602 Volume 16, Issue 4 (December 1990)
1603 */
1604
1605 /* Arrays ln_lead_table and ln_tail_table contain
1606 leading and trailing parts respectively of precomputed
1607 values of natural log(1+i/64), for i = 0, 1, ..., 64.
1608 ln_lead_table contains the first 24 bits of precision,
1609 and ln_tail_table contains a further 53 bits precision. */
1610
1611 static const double ln_lead_table[65] = {
1612 0.00000000000000000000e+00, /* 0x0000000000000000 */
1613 1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */
1614 3.07716131210327148438e-02, /* 0x3f9f829800000000 */
1615 4.58095073699951171875e-02, /* 0x3fa7745800000000 */
1616 6.06245994567871093750e-02, /* 0x3faf0a3000000000 */
1617 7.52233862876892089844e-02, /* 0x3fb341d700000000 */
1618 8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */
1619 1.03796780109405517578e-01, /* 0x3fba926d00000000 */
1620 1.17783010005950927734e-01, /* 0x3fbe270700000000 */
1621 1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */
1622 1.45181953907012939453e-01, /* 0x3fc2955280000000 */
1623 1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */
1624 1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */
1625 1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */
1626 1.97825729846954345703e-01, /* 0x3fc9525a80000000 */
1627 2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */
1628 2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */
1629 2.35566020011901855469e-01, /* 0x3fce270700000000 */
1630 2.47836112976074218750e-01, /* 0x3fcfb91800000000 */
1631 2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */
1632 2.71933674812316894531e-01, /* 0x3fd1675c80000000 */
1633 2.83768117427825927734e-01, /* 0x3fd22941c0000000 */
1634 2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */
1635 3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */
1636 3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */
1637 3.29753279685974121094e-01, /* 0x3fd51aad80000000 */
1638 3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */
1639 3.51976394653320312500e-01, /* 0x3fd686c800000000 */
1640 3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */
1641 3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */
1642 3.84411692619323730469e-01, /* 0x3fd89a3380000000 */
1643 3.94993782043457031250e-01, /* 0x3fd9479400000000 */
1644 4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */
1645 4.15827870368957519531e-01, /* 0x3fda9cec80000000 */
1646 4.26084339618682861328e-01, /* 0x3fdb44f740000000 */
1647 4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */
1648 4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */
1649 4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */
1650 4.66089725494384765625e-01, /* 0x3fddd46a00000000 */
1651 4.75845873355865478516e-01, /* 0x3fde744240000000 */
1652 4.85507786273956298828e-01, /* 0x3fdf128f40000000 */
1653 4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */
1654 5.04556000232696533203e-01, /* 0x3fe02552a0000000 */
1655 5.13945698738098144531e-01, /* 0x3fe0723e40000000 */
1656 5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */
1657 5.32464742660522460938e-01, /* 0x3fe109f380000000 */
1658 5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */
1659 5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */
1660 5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */
1661 5.68504691123962402344e-01, /* 0x3fe23130c0000000 */
1662 5.77315330505371093750e-01, /* 0x3fe2795e00000000 */
1663 5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */
1664 5.94707071781158447266e-01, /* 0x3fe307d720000000 */
1665 6.03290796279907226562e-01, /* 0x3fe34e2880000000 */
1666 6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */
1667 6.20240390300750732422e-01, /* 0x3fe3d90260000000 */
1668 6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */
1669 6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */
1670 6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */
1671 6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */
1672 6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */
1673 6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */
1674 6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */
1675 6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */
1676 6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */
1677
1678 static const double ln_tail_table[65] = {
1679 0.00000000000000000000e+00, /* 0x0000000000000000 */
1680 5.15092497094772879206e-09, /* 0x3e361f807c79f3db */
1681 4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */
1682 2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */
1683 2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */
1684 3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */
1685 3.23392843005887000414e-08, /* 0x3e615cad69737c93 */
1686 1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */
1687 2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */
1688 5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */
1689 5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */
1690 5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */
1691 5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */
1692 9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */
1693 1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */
1694 3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */
1695 3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */
1696 5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */
1697 5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */
1698 3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */
1699 4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */
1700 5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */
1701 5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */
1702 1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */
1703 1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */
1704 6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */
1705 5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */
1706 2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */
1707 5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */
1708 5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */
1709 6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */
1710 2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */
1711 4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */
1712 2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */
1713 5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */
1714 2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */
1715 7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */
1716 5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */
1717 4.43021445893361960146e-09, /* 0x3e33071282fb989b */
1718 3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */
1719 2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */
1720 1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */
1721 1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */
1722 5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */
1723 7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */
1724 5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */
1725 3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */
1726 3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */
1727 5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */
1728 4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */
1729 3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */
1730 2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */
1731 3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */
1732 5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */
1733 3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */
1734 1.94511067964296180547e-08, /* 0x3e54e2adf548084c */
1735 1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */
1736 3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */
1737 5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */
1738 3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */
1739 1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */
1740 4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */
1741 2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */
1742 1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */
1743 5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */
1744
1745 /* Approximating polynomial coefficients for x near 1.0 */
1746 static const double
1747 ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */
1748 ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */
1749 ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */
1750 ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
1751
1752 /* Approximating polynomial coefficients for other x */
1753 static const double
1754 cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */
1755 cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */
1756 cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
1757
1758 static const unsigned long long
1759 log_thresh1 = 0x3fee0faa00000000,
1760 log_thresh2 = 0x3ff1082c00000000;
1761
1762 /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
1763 log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */
1764 if (ux >= log_thresh1 && ux <= log_thresh2)
1765 {
1766 /* Arguments close to 1.0 are handled separately to maintain
1767 accuracy.
1768
1769 The approximation in this region exploits the identity
1770 log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where
1771 u = 2r / (2+r).
1772 Note that the right hand side has an odd Taylor series expansion
1773 which converges much faster than the Taylor series expansion of
1774 log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
1775 u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).
1776
1777 One subtlety is that since u cannot be calculated from
1778 r exactly, the rounding error in the first u should be
1779 avoided if possible. To accomplish this, we observe that
1780 u = r - r*r/(2+r).
1781 Since x (=1+r) is the input argument, and thus presumed exact,
1782 the formula above approximates u accurately because
1783 u = r - correction,
1784 and the magnitude of "correction" (of the order of r*r)
1785 is small.
1786 With these observations, we will approximate log( 1 + r ) by
1787 r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ).
1788
1789 We approximate log(1+r) by an odd polynomial in u, where
1790 u = 2r/(2+r) = r - r*r/(2+r).
1791 */
1792 r = x - 1.0;
1793 u = r / (2.0 + r);
1794 correction = r * u;
1795 u = u + u;
1796 v = u * u;
1797 z1 = r;
1798 z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
1799 *r1 = z1;
1800 *r2 = z2;
1801 *xexp = 0;
1802 }
1803 else
1804 {
1805 /*
1806 First, we decompose the argument x to the form
1807 x = 2**M * (F1 + F2),
1808 where 1 <= F1+F2 < 2, M has the value of an integer,
1809 F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.
1810
1811 Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
1812 in U, where U = 2 F2 / (2 F2 + F1).
1813 Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
1814 The core approximation calculates
1815 Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1.
1816 Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
1817 thus, Poly = 2 arctanh( U/2 ) / U - 1.
1818
1819 It is not hard to see that
1820 log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
1821 Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1).
1822 The values of log(F1) are calculated beforehand and stored
1823 in the program.
1824 */
1825
1826 f = x;
1827 if (ux < IMPBIT_DP64)
1828 {
1829 /* The input argument x is denormalized */
1830 /* Normalize f by increasing the exponent by 60
1831 and subtracting a correction to account for the implicit
1832 bit. This replaces a slow denormalized
1833 multiplication by a fast normal subtraction. */
1834 static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
1835 GET_BITS_DP64(f, ux);
1836 ux |= 0x03d0000000000000;
1837 PUT_BITS_DP64(ux, f);
1838 f -= corr;
1839 GET_BITS_DP64(f, ux);
1840 expadjust = 60;
1841 }
1842 else
1843 expadjust = 0;
1844
1845 /* Store the exponent of x in xexp and put
1846 f into the range [0.5,1) */
1847 *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
1848 PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);
1849
1850 /* Now x = 2**xexp * f, 1/2 <= f < 1. */
1851
1852 /* Set index to be the nearest integer to 128*f */
1853 r = 128.0 * f;
1854 index = (int)(r + 0.5);
1855
1856 z1 = ln_lead_table[index-64];
1857 q = ln_tail_table[index-64];
1858 f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
1859 f2 = f - f1;
1860 /* At this point, x = 2**xexp * ( f1 + f2 ) where
1861 f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */
1862
1863 /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
1864 /* u = f2 / (f1 + 0.5 * f2); */
1865 u = f2 / (f1 + 0.5 * f2);
1866
1867 /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
1868 The core approximation calculates
1869 poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */
1870 v = u * u;
1871 poly = (v * (cb_1 + v * (cb_2 + v * cb_3)));
1872 z2 = q + (u + u * poly);
1873 *r1 = z1;
1874 *r2 = z2;
1875 }
1876 return;
1877 }
1878 #endif /* USE_LOG_KERNEL_AMD */
1879
1880 #if defined(USE_REMAINDER_PIBY2F_INLINE)
1881 /* Define this to get debugging print statements activated */
1882 #define DEBUGGING_PRINT
1883 #undef DEBUGGING_PRINT
1884
1885
1886 #ifdef DEBUGGING_PRINT
1887 #include <stdio.h>
d2b(long long d,int bitsper,int point)1888 char *d2b(long long d, int bitsper, int point)
1889 {
1890 static char buff[200];
1891 int i, j;
1892 j = bitsper;
1893 if (point >= 0 && point <= bitsper)
1894 j++;
1895 buff[j] = '\0';
1896 for (i = bitsper - 1; i >= 0; i--)
1897 {
1898 j--;
1899 if (d % 2 == 1)
1900 buff[j] = '1';
1901 else
1902 buff[j] = '0';
1903 if (i == point)
1904 {
1905 j--;
1906 buff[j] = '.';
1907 }
1908 d /= 2;
1909 }
1910 return buff;
1911 }
1912 #endif
1913
1914 /* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
1915 extra precision, and return the result in r.
1916 Return value "region" tells how many lots of pi/2 were subtracted
1917 from x to put it in the range [-pi/4,pi/4], mod 4. */
__remainder_piby2f_inline(unsigned long long ux,double * r,int * region)1918 static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region)
1919 {
1920
1921 /* This method simulates multi-precision floating-point
1922 arithmetic and is accurate for all 1 <= x < infinity */
1923 #if 0
1924 const int bitsper = 36;
1925 #else
1926 #define bitsper 36
1927 #endif
1928 unsigned long long res[10];
1929 unsigned long long u, carry, mask, mant, nextbits;
1930 int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
1931 double dx;
1932 static const double
1933 piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
1934 static unsigned long long pibits[] =
1935 {
1936 0LL,
1937 5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
1938 34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
1939 19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
1940 };
1941
1942 xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
1943 ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;
1944
1945
1946 /* Now ux is the mantissa bit pattern of x as a long long integer */
1947 mask = 1;
1948 mask = (mask << bitsper) - 1;
1949
1950 /* Set first and last to the positions of the first
1951 and last chunks of 2/pi that we need */
1952 first = xexp / bitsper;
1953 resexp = xexp - first * bitsper;
1954 /* 120 is the theoretical maximum number of bits (actually
1955 115 for IEEE single precision) that we need to extract
1956 from the middle of 2/pi to compute the reduced argument
1957 accurately enough for our purposes */
1958 last = first + 120 / bitsper;
1959
1960 #ifdef DEBUGGING_PRINT
1961 printf("first = %d, last = %d\n", first, last);
1962 #endif
1963
1964 /* Do a long multiplication of the bits of 2/pi by the
1965 integer mantissa */
1966 #if 0
1967 for (i = last; i >= first; i--)
1968 {
1969 u = pibits[i] * ux + carry;
1970 res[i - first] = u & mask;
1971 carry = u >> bitsper;
1972 }
1973 res[last - first + 1] = 0;
1974 #else
1975 /* Unroll the loop. This is only correct because we know
1976 that bitsper is fixed as 36. */
1977 res[4] = 0;
1978 u = pibits[last] * ux;
1979 res[3] = u & mask;
1980 carry = u >> bitsper;
1981 u = pibits[last - 1] * ux + carry;
1982 res[2] = u & mask;
1983 carry = u >> bitsper;
1984 u = pibits[last - 2] * ux + carry;
1985 res[1] = u & mask;
1986 carry = u >> bitsper;
1987 u = pibits[first] * ux + carry;
1988 res[0] = u & mask;
1989 #endif
1990
1991 #ifdef DEBUGGING_PRINT
1992 printf("resexp = %d\n", resexp);
1993 printf("Significant part of x * 2/pi with binary"
1994 " point in correct place:\n");
1995 for (i = 0; i <= last - first; i++)
1996 {
1997 if (i > 0 && i % 5 == 0)
1998 printf("\n ");
1999 if (i == 1)
2000 printf("%s ", d2b(res[i], bitsper, resexp));
2001 else
2002 printf("%s ", d2b(res[i], bitsper, -1));
2003 }
2004 printf("\n");
2005 #endif
2006
2007 /* Reconstruct the result */
2008 ltb = (int)((((res[0] << bitsper) | res[1])
2009 >> (bitsper - 1 - resexp)) & 7);
2010
2011 /* determ says whether the fractional part is >= 0.5 */
2012 determ = ltb & 1;
2013
2014 #ifdef DEBUGGING_PRINT
2015 printf("ltb = %d (last two bits before binary point"
2016 " and first bit after)\n", ltb);
2017 printf("determ = %d (1 means need to negate because the fractional\n"
2018 " part of x * 2/pi is greater than 0.5)\n", determ);
2019 #endif
2020
2021 i = 1;
2022 if (determ)
2023 {
2024 /* The mantissa is >= 0.5. We want to subtract it
2025 from 1.0 by negating all the bits */
2026 *region = ((ltb >> 1) + 1) & 3;
2027 mant = 1;
2028 mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
2029 while (mant < 0x0000000000010000)
2030 {
2031 i++;
2032 mant = (mant << bitsper) | (~(res[i]) & mask);
2033 }
2034 nextbits = (~(res[i+1]) & mask);
2035 }
2036 else
2037 {
2038 *region = (ltb >> 1);
2039 mant = 1;
2040 mant = res[1] & ((mant << (bitsper - resexp)) - 1);
2041 while (mant < 0x0000000000010000)
2042 {
2043 i++;
2044 mant = (mant << bitsper) | res[i];
2045 }
2046 nextbits = res[i+1];
2047 }
2048
2049 #ifdef DEBUGGING_PRINT
2050 printf("First bits of mant = %s\n", d2b(mant, bitsper, -1));
2051 #endif
2052
2053 /* Normalize the mantissa. The shift value 6 here, determined by
2054 trial and error, seems to give optimal speed. */
2055 bc = 0;
2056 while (mant < 0x0000400000000000)
2057 {
2058 bc += 6;
2059 mant <<= 6;
2060 }
2061 while (mant < 0x0010000000000000)
2062 {
2063 bc++;
2064 mant <<= 1;
2065 }
2066 mant |= nextbits >> (bitsper - bc);
2067
2068 rexp = 52 + resexp - bc - i * bitsper;
2069
2070 #ifdef DEBUGGING_PRINT
2071 printf("Normalised mantissa = 0x%016lx\n", mant);
2072 printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
2073 #endif
2074
2075 /* Put the result exponent rexp onto the mantissa pattern */
2076 u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
2077 ux = (mant & MANTBITS_DP64) | u;
2078 if (determ)
2079 /* If we negated the mantissa we negate x too */
2080 ux |= SIGNBIT_DP64;
2081 PUT_BITS_DP64(ux, dx);
2082
2083 #ifdef DEBUGGING_PRINT
2084 printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx));
2085 #endif
2086
2087 /* x is a double precision version of the fractional part of
2088 x * 2 / pi. Multiply x by pi/2 in double precision
2089 to get the reduced argument r. */
2090 *r = dx * piby2;
2091
2092 #ifdef DEBUGGING_PRINT
2093 printf(" r = frac(x*2/pi) * pi/2:\n");
2094 printf(" r = %25.20e = %s\n", *r, double2hex(r));
2095 printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
2096 *region);
2097 #endif
2098 }
2099 #endif /* USE_REMAINDER_PIBY2F_INLINE */
2100
2101 #endif /* LIBM_INLINES_AMD_H_INCLUDED */
2102