1 /***********************************************************************************/ 2 /** MIT License **/ 3 /** ----------- **/ 4 /** **/ 5 /** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/ 6 /** **/ 7 /** Permission is hereby granted, free of charge, to any person obtaining a copy **/ 8 /** of this Software and associated documentaon files (the "Software"), to deal **/ 9 /** in the Software without restriction, including without limitation the rights **/ 10 /** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ 11 /** copies of the Software, and to permit persons to whom the Software is **/ 12 /** furnished to do so, subject to the following conditions: **/ 13 /** **/ 14 /** The above copyright notice and this permission notice shall be included in **/ 15 /** all copies or substantial portions of the Software. **/ 16 /** **/ 17 /** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ 18 /** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ 19 /** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/ 20 /** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ 21 /** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ 22 /** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ 23 /** THE SOFTWARE. **/ 24 /***********************************************************************************/ 25 26 #ifndef LIBM_INLINES_AMD_H_INCLUDED 27 #define LIBM_INLINES_AMD_H_INCLUDED 1 28 29 #include "libm_util.h" 30 31 /* Set defines for inline functions calling other inlines */ 32 #if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \ 33 defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \ 34 defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \ 35 defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \ 36 defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \ 37 defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \ 38 (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)) 39 #undef USE_RAISE_FPSW_FLAGS 40 #define USE_RAISE_FPSW_FLAGS 1 41 #endif 42 43 #if defined(USE_SPLITDOUBLE) 44 /* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. 45 Assumes that x is not zero, denormal, infinity or NaN, but these conditions 46 are not checked */ 47 static inline void splitDouble(double x, int *e, double *m) 48 { 49 unsigned long long ux, uy; 50 GET_BITS_DP64(x, ux); 51 uy = ux; 52 ux &= EXPBITS_DP64; 53 ux >>= EXPSHIFTBITS_DP64; 54 *e = (int)ux - EXPBIAS_DP64 + 1; 55 uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64; 56 PUT_BITS_DP64(uy, x); 57 *m = x; 58 } 59 #endif /* USE_SPLITDOUBLE */ 60 61 62 #if defined(USE_SPLITDOUBLE_2) 63 /* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0. 64 Assumes that x is not zero, denormal, infinity or NaN, but these conditions 65 are not checked. Also assumes EXPBIAS_DP is odd. With this 66 assumption, e will be even on exit. */ 67 static inline void splitDouble_2(double x, int *e, double *m) 68 { 69 unsigned long long ux, vx; 70 GET_BITS_DP64(x, ux); 71 vx = ux; 72 ux &= EXPBITS_DP64; 73 ux >>= EXPSHIFTBITS_DP64; 74 if (ux & 1) 75 { 76 /* The exponent is odd */ 77 vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64; 78 PUT_BITS_DP64(vx, x); 79 *m = x; 80 *e = ux - EXPBIAS_DP64; 81 } 82 else 83 { 84 /* The exponent is even */ 85 vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64; 86 PUT_BITS_DP64(vx, x); 87 *m = x; 88 *e = ux - EXPBIAS_DP64 - 1; 89 } 90 } 91 #endif /* USE_SPLITDOUBLE_2 */ 92 93 94 #if defined(USE_SPLITFLOAT) 95 /* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. 96 Assumes that x is not zero, denormal, infinity or NaN, but these conditions 97 are not checked */ 98 static inline void splitFloat(float x, int *e, float *m) 99 { 100 unsigned int ux, uy; 101 GET_BITS_SP32(x, ux); 102 uy = ux; 103 ux &= EXPBITS_SP32; 104 ux >>= EXPSHIFTBITS_SP32; 105 *e = (int)ux - EXPBIAS_SP32 + 1; 106 uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32; 107 PUT_BITS_SP32(uy, x); 108 *m = x; 109 } 110 #endif /* USE_SPLITFLOAT */ 111 112 113 #if defined(USE_SCALEDOUBLE_1) 114 /* Scales the double x by 2.0**n. 115 Assumes EMIN <= n <= EMAX, though this condition is not checked. */ 116 static inline double scaleDouble_1(double x, int n) 117 { 118 double t; 119 /* Construct the number t = 2.0**n */ 120 PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t); 121 return x*t; 122 } 123 #endif /* USE_SCALEDOUBLE_1 */ 124 125 126 #if defined(USE_SCALEDOUBLE_2) 127 /* Scales the double x by 2.0**n. 128 Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ 129 static inline double scaleDouble_2(double x, int n) 130 { 131 double t1, t2; 132 int n1, n2; 133 n1 = n / 2; 134 n2 = n - n1; 135 /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ 136 PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); 137 PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); 138 return (x*t1)*t2; 139 } 140 #endif /* USE_SCALEDOUBLE_2 */ 141 142 143 #if defined(USE_SCALEDOUBLE_3) 144 /* Scales the double x by 2.0**n. 145 Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ 146 static inline double scaleDouble_3(double x, int n) 147 { 148 double t1, t2, t3; 149 int n1, n2, n3; 150 n1 = n / 3; 151 n2 = (n - n1) / 2; 152 n3 = n - n1 - n2; 153 /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ 154 PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); 155 PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); 156 PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3); 157 return ((x*t1)*t2)*t3; 158 } 159 #endif /* USE_SCALEDOUBLE_3 */ 160 161 162 #if defined(USE_SCALEFLOAT_1) 163 /* Scales the float x by 2.0**n. 164 Assumes EMIN <= n <= EMAX, though this condition is not checked. */ 165 static inline float scaleFloat_1(float x, int n) 166 { 167 float t; 168 /* Construct the number t = 2.0**n */ 169 PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t); 170 return x*t; 171 } 172 #endif /* USE_SCALEFLOAT_1 */ 173 174 175 #if defined(USE_SCALEFLOAT_2) 176 /* Scales the float x by 2.0**n. 177 Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ 178 static inline float scaleFloat_2(float x, int n) 179 { 180 float t1, t2; 181 int n1, n2; 182 n1 = n / 2; 183 n2 = n - n1; 184 /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ 185 PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); 186 PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); 187 return (x*t1)*t2; 188 } 189 #endif /* USE_SCALEFLOAT_2 */ 190 191 192 #if defined(USE_SCALEFLOAT_3) 193 /* Scales the float x by 2.0**n. 194 Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ 195 static inline float scaleFloat_3(float x, int n) 196 { 197 float t1, t2, t3; 198 int n1, n2, n3; 199 n1 = n / 3; 200 n2 = (n - n1) / 2; 201 n3 = n - n1 - n2; 202 /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ 203 PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); 204 PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); 205 PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3); 206 return ((x*t1)*t2)*t3; 207 } 208 #endif /* USE_SCALEFLOAT_3 */ 209 210 #if defined(USE_SETPRECISIONDOUBLE) 211 unsigned int setPrecisionDouble(void) 212 { 213 unsigned int cw, cwold = 0; 214 /* There is no precision control on Hammer */ 215 return cwold; 216 } 217 #endif /* USE_SETPRECISIONDOUBLE */ 218 219 #if defined(USE_RESTOREPRECISION) 220 void restorePrecision(unsigned int cwold) 221 { 222 /* There is no precision control on Hammer */ 223 return; 224 } 225 #endif /* USE_RESTOREPRECISION */ 226 227 228 229 230 #if defined(USE_RAISE_FPSW_FLAGS) 231 /* Raises floating-point status flags. The argument should be 232 the bitwise or of the flags to be raised, from the 233 list above, e.g. 234 raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); 235 */ 236 237 /* ISSUE - wat - 08182010 238 * These AMD_ISW_* flags are duplicated from trans.h 239 * this is not clean; Mark S. did it for targeted fix of 855457 240 * Eliminate all redundant flags in the next overhaul 241 */ 242 243 #define AMD_ISW_INVALID 0x0001 244 #define AMD_ISW_DENORMAL 0x0002 245 #define AMD_ISW_ZERODIVIDE 0x0004 246 #define AMD_ISW_OVERFLOW 0x0008 247 #define AMD_ISW_UNDERFLOW 0x0010 248 #define AMD_ISW_INEXACT 0x0020 249 250 /* use this function from fpctrl.c */ 251 void _set_statfp(uintptr_t); 252 253 static inline void raise_fpsw_flags(int flags) 254 { 255 unsigned int f = 0; 256 257 if (flags & AMD_F_OVERFLOW) { f |= AMD_ISW_OVERFLOW; } 258 if (flags & AMD_F_UNDERFLOW) { f |= AMD_ISW_UNDERFLOW; } 259 if (flags & AMD_F_DIVBYZERO) { f |= AMD_ISW_ZERODIVIDE; } 260 if (flags & AMD_F_INVALID) { f |= AMD_ISW_INVALID; } 261 if (flags & AMD_F_INEXACT) { f |= AMD_ISW_INEXACT; } 262 263 _set_statfp(f); 264 } 265 266 #endif /* USE_RAISE_FPSW_FLAGS */ 267 268 269 #if defined(USE_GET_FPSW_INLINE) 270 /* Return the current floating-point status word */ 271 static inline unsigned int get_fpsw_inline(void) 272 { 273 return _mm_getcsr(); 274 } 275 #endif /* USE_GET_FPSW_INLINE */ 276 277 #if defined(USE_SET_FPSW_INLINE) 278 /* Set the floating-point status word */ 279 static inline void set_fpsw_inline(unsigned int sw) 280 { 281 _mm_setcsr(sw); 282 } 283 #endif /* USE_SET_FPSW_INLINE */ 284 285 286 287 #if defined(USE_VAL_WITH_FLAGS) 288 /* Returns a double value after raising the given flags, 289 e.g. val_with_flags(AMD_F_INEXACT); 290 */ 291 static inline double val_with_flags(double val, int flags) 292 { 293 raise_fpsw_flags(flags); 294 return val; 295 } 296 #endif /* USE_VAL_WITH_FLAGS */ 297 298 #if defined(USE_VALF_WITH_FLAGS) 299 /* Returns a float value after raising the given flags, 300 e.g. valf_with_flags(AMD_F_INEXACT); 301 */ 302 static inline float valf_with_flags(float val, int flags) 303 { 304 raise_fpsw_flags(flags); 305 return val; 306 } 307 #endif /* USE_VALF_WITH_FLAGS */ 308 309 310 #if defined(USE_ZERO_WITH_FLAGS) 311 /* Returns a double +zero after raising the given flags, 312 e.g. zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID); 313 */ 314 static inline double zero_with_flags(int flags) 315 { 316 raise_fpsw_flags(flags); 317 return 0.0; 318 } 319 #endif /* USE_ZERO_WITH_FLAGS */ 320 321 322 #if defined(USE_ZEROF_WITH_FLAGS) 323 /* Returns a float +zero after raising the given flags, 324 e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID); 325 */ 326 static inline float zerof_with_flags(int flags) 327 { 328 raise_fpsw_flags(flags); 329 return 0.0F; 330 } 331 #endif /* USE_ZEROF_WITH_FLAGS */ 332 333 334 #if defined(USE_NAN_WITH_FLAGS) 335 /* Returns a double quiet +nan after raising the given flags, 336 e.g. nan_with_flags(AMD_F_INVALID); 337 */ 338 static inline double nan_with_flags(int flags) 339 { 340 double z; 341 raise_fpsw_flags(flags); 342 PUT_BITS_DP64(0x7ff8000000000000, z); 343 return z; 344 } 345 #endif /* USE_NAN_WITH_FLAGS */ 346 347 #if defined(USE_NANF_WITH_FLAGS) 348 /* Returns a float quiet +nan after raising the given flags, 349 e.g. nanf_with_flags(AMD_F_INVALID); 350 */ 351 static inline float nanf_with_flags(int flags) 352 { 353 float z; 354 raise_fpsw_flags(flags); 355 PUT_BITS_SP32(0x7fc00000, z); 356 return z; 357 } 358 #endif /* USE_NANF_WITH_FLAGS */ 359 360 361 #if defined(USE_INDEFINITE_WITH_FLAGS) 362 /* Returns a double indefinite after raising the given flags, 363 e.g. indefinite_with_flags(AMD_F_INVALID); 364 */ 365 static inline double indefinite_with_flags(int flags) 366 { 367 double z; 368 raise_fpsw_flags(flags); 369 PUT_BITS_DP64(0xfff8000000000000, z); 370 return z; 371 } 372 #endif /* USE_INDEFINITE_WITH_FLAGS */ 373 374 #if defined(USE_INDEFINITEF_WITH_FLAGS) 375 /* Returns a float quiet +indefinite after raising the given flags, 376 e.g. indefinitef_with_flags(AMD_F_INVALID); 377 */ 378 static inline float indefinitef_with_flags(int flags) 379 { 380 float z; 381 raise_fpsw_flags(flags); 382 PUT_BITS_SP32(0xffc00000, z); 383 return z; 384 } 385 #endif /* USE_INDEFINITEF_WITH_FLAGS */ 386 387 388 #ifdef USE_INFINITY_WITH_FLAGS 389 /* Returns a positive double infinity after raising the given flags, 390 e.g. infinity_with_flags(AMD_F_OVERFLOW); 391 */ 392 static inline double infinity_with_flags(int flags) 393 { 394 double z; 395 raise_fpsw_flags(flags); 396 PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z); 397 return z; 398 } 399 #endif /* USE_INFINITY_WITH_FLAGS */ 400 401 #ifdef USE_INFINITYF_WITH_FLAGS 402 /* Returns a positive float infinity after raising the given flags, 403 e.g. infinityf_with_flags(AMD_F_OVERFLOW); 404 */ 405 static inline float infinityf_with_flags(int flags) 406 { 407 float z; 408 raise_fpsw_flags(flags); 409 PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z); 410 return z; 411 } 412 #endif /* USE_INFINITYF_WITH_FLAGS */ 413 414 #if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF) 415 #include <errno.h> 416 #endif 417 418 /* define the Microsoft specific error handling routine */ 419 double _handle_error( 420 char *fname, 421 int opcode, 422 unsigned long long value, 423 int type, 424 int flags, 425 int error, 426 double arg1, 427 double arg2, 428 int nargs 429 ); 430 float _handle_errorf( 431 char *fname, 432 int opcode, 433 unsigned long long value, 434 int type, 435 int flags, 436 int error, 437 float arg1, 438 float arg2, 439 int nargs 440 ); 441 442 #if defined(USE_SPLITEXP) 443 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). 444 Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments 445 abs(x) > large/(ln(base)) (where large is the largest representable 446 floating point number) should be handled separately instead of calling 447 this function. This function is called by exp, exp2, exp10, 448 cosh and sinh. */ 449 static inline void splitexp(double x, double logbase, 450 double thirtytwo_by_logbaseof2, 451 double logbaseof2_by_32_lead, 452 double logbaseof2_by_32_trail, 453 int *m, double *z1, double *z2) 454 { 455 double q, r, r1, r2, f1, f2; 456 int n, j; 457 458 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain 459 leading and trailing parts respectively of precomputed 460 values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. 461 two_to_jby32_lead_table contains the first 25 bits of precision, 462 and two_to_jby32_trail_table contains a further 53 bits precision. */ 463 464 static const double two_to_jby32_lead_table[32] = { 465 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ 466 1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */ 467 1.04427373409271240234e+00, /* 0x3ff0b55860000000 */ 468 1.06714040040969848633e+00, /* 0x3ff11301d0000000 */ 469 1.09050768613815307617e+00, /* 0x3ff172b830000000 */ 470 1.11438673734664916992e+00, /* 0x3ff1d48730000000 */ 471 1.13878858089447021484e+00, /* 0x3ff2387a60000000 */ 472 1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */ 473 1.18920707702636718750e+00, /* 0x3ff306fe00000000 */ 474 1.21524733304977416992e+00, /* 0x3ff371a730000000 */ 475 1.24185776710510253906e+00, /* 0x3ff3dea640000000 */ 476 1.26905095577239990234e+00, /* 0x3ff44e0860000000 */ 477 1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */ 478 1.32523661851882934570e+00, /* 0x3ff5342b50000000 */ 479 1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */ 480 1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */ 481 1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */ 482 1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */ 483 1.47682613134384155273e+00, /* 0x3ff7a11470000000 */ 484 1.50916439294815063477e+00, /* 0x3ff8258990000000 */ 485 1.54221081733703613281e+00, /* 0x3ff8ace540000000 */ 486 1.57598084211349487305e+00, /* 0x3ff93737b0000000 */ 487 1.61049032211303710938e+00, /* 0x3ff9c49180000000 */ 488 1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */ 489 1.68179279565811157227e+00, /* 0x3ffae89f90000000 */ 490 1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */ 491 1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */ 492 1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */ 493 1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */ 494 1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */ 495 1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */ 496 1.95714408159255981445e+00}; /* 0x3fff507650000000 */ 497 498 static const double two_to_jby32_trail_table[32] = { 499 0.00000000000000000000e+00, /* 0x0000000000000000 */ 500 1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */ 501 4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */ 502 2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */ 503 4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */ 504 5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */ 505 5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */ 506 1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */ 507 3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */ 508 2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */ 509 4.49683815095311756138e-08, /* 0x3e682468446b6824 */ 510 1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */ 511 1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */ 512 2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */ 513 4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */ 514 8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */ 515 2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */ 516 3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */ 517 1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */ 518 3.46452721050003920866e-08, /* 0x3e62999c25159f11 */ 519 8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */ 520 2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */ 521 9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */ 522 8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */ 523 3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */ 524 1.11084703472699692902e-08, /* 0x3e47daf237553d84 */ 525 5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */ 526 4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */ 527 4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */ 528 1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */ 529 9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */ 530 4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */ 531 532 /* 533 Step 1. Reduce the argument. 534 535 To perform argument reduction, we find the integer n such that 536 x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. 537 n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and 538 remainder by x - n*logbaseof2/32. The calculation of n is 539 straightforward whereas the computation of x - n*logbaseof2/32 540 must be carried out carefully. 541 logbaseof2/32 is so represented in two pieces that 542 (1) logbaseof2/32 is known to extra precision, (2) the product 543 of n and the leading piece is a model number and is hence 544 calculated without error, and (3) the subtraction of the value 545 obtained in (2) from x is a model number and is hence again 546 obtained without error. 547 */ 548 549 r = x * thirtytwo_by_logbaseof2; 550 /* Set n = nearest integer to r */ 551 /* This is faster on Hammer */ 552 if (r > 0) 553 n = (int)(r + 0.5); 554 else 555 n = (int)(r - 0.5); 556 557 r1 = x - n * logbaseof2_by_32_lead; 558 r2 = - n * logbaseof2_by_32_trail; 559 560 /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ 561 /* j = n % 32; 562 if (j < 0) j += 32; */ 563 j = n & 0x0000001f; 564 565 f1 = two_to_jby32_lead_table[j]; 566 f2 = two_to_jby32_trail_table[j]; 567 568 *m = (n - j) / 32; 569 570 /* Step 2. The following is the core approximation. We approximate 571 exp(r1+r2)-1 by a polynomial. */ 572 573 r1 *= logbase; r2 *= logbase; 574 575 r = r1 + r2; 576 q = r1 + (r2 + 577 r*r*( 5.00000000000000008883e-01 + 578 r*( 1.66666666665260878863e-01 + 579 r*( 4.16666666662260795726e-02 + 580 r*( 8.33336798434219616221e-03 + 581 r*( 1.38889490863777199667e-03 )))))); 582 583 /* Step 3. Function value reconstruction. 584 We now reconstruct the exponential of the input argument 585 so that exp(x) = 2**m * (z1 + z2). 586 The order of the computation below must be strictly observed. */ 587 588 *z1 = f1; 589 *z2 = f2 + ((f1 + f2) * q); 590 } 591 #endif /* USE_SPLITEXP */ 592 593 594 #if defined(USE_SPLITEXPF) 595 /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). 596 Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments 597 abs(x) > large/(ln(base)) (where large is the largest representable 598 floating point number) should be handled separately instead of calling 599 this function. This function is called by exp, exp2, exp10, 600 cosh and sinh. */ 601 static inline void splitexpf(float x, float logbase, 602 float thirtytwo_by_logbaseof2, 603 float logbaseof2_by_32_lead, 604 float logbaseof2_by_32_trail, 605 int *m, float *z1, float *z2) 606 { 607 float q, r, r1, r2, f1, f2; 608 int n, j; 609 610 /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain 611 leading and trailing parts respectively of precomputed 612 values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. 613 two_to_jby32_lead_table contains the first 10 bits of precision, 614 and two_to_jby32_trail_table contains a further 24 bits precision. */ 615 616 static const float two_to_jby32_lead_table[32] = { 617 1.0000000000E+00F, /* 0x3F800000 */ 618 1.0214843750E+00F, /* 0x3F82C000 */ 619 1.0429687500E+00F, /* 0x3F858000 */ 620 1.0664062500E+00F, /* 0x3F888000 */ 621 1.0898437500E+00F, /* 0x3F8B8000 */ 622 1.1132812500E+00F, /* 0x3F8E8000 */ 623 1.1386718750E+00F, /* 0x3F91C000 */ 624 1.1621093750E+00F, /* 0x3F94C000 */ 625 1.1875000000E+00F, /* 0x3F980000 */ 626 1.2148437500E+00F, /* 0x3F9B8000 */ 627 1.2402343750E+00F, /* 0x3F9EC000 */ 628 1.2675781250E+00F, /* 0x3FA24000 */ 629 1.2949218750E+00F, /* 0x3FA5C000 */ 630 1.3242187500E+00F, /* 0x3FA98000 */ 631 1.3535156250E+00F, /* 0x3FAD4000 */ 632 1.3828125000E+00F, /* 0x3FB10000 */ 633 1.4140625000E+00F, /* 0x3FB50000 */ 634 1.4433593750E+00F, /* 0x3FB8C000 */ 635 1.4765625000E+00F, /* 0x3FBD0000 */ 636 1.5078125000E+00F, /* 0x3FC10000 */ 637 1.5410156250E+00F, /* 0x3FC54000 */ 638 1.5742187500E+00F, /* 0x3FC98000 */ 639 1.6093750000E+00F, /* 0x3FCE0000 */ 640 1.6445312500E+00F, /* 0x3FD28000 */ 641 1.6816406250E+00F, /* 0x3FD74000 */ 642 1.7167968750E+00F, /* 0x3FDBC000 */ 643 1.7558593750E+00F, /* 0x3FE0C000 */ 644 1.7929687500E+00F, /* 0x3FE58000 */ 645 1.8339843750E+00F, /* 0x3FEAC000 */ 646 1.8730468750E+00F, /* 0x3FEFC000 */ 647 1.9140625000E+00F, /* 0x3FF50000 */ 648 1.9570312500E+00F}; /* 0x3FFA8000 */ 649 650 static const float two_to_jby32_trail_table[32] = { 651 0.0000000000E+00F, /* 0x00000000 */ 652 4.1277357377E-04F, /* 0x39D86988 */ 653 1.3050324051E-03F, /* 0x3AAB0D9F */ 654 7.3415064253E-04F, /* 0x3A407404 */ 655 6.6398258787E-04F, /* 0x3A2E0F1E */ 656 1.1054925853E-03F, /* 0x3A90E62D */ 657 1.1675967835E-04F, /* 0x38F4DCE0 */ 658 1.6154836630E-03F, /* 0x3AD3BEA3 */ 659 1.7071149778E-03F, /* 0x3ADFC146 */ 660 4.0360994171E-04F, /* 0x39D39B9C */ 661 1.6234370414E-03F, /* 0x3AD4C982 */ 662 1.4728321694E-03F, /* 0x3AC10C0C */ 663 1.9176795613E-03F, /* 0x3AFB5AA6 */ 664 1.0178930825E-03F, /* 0x3A856AD3 */ 665 7.3992193211E-04F, /* 0x3A41F752 */ 666 1.0973819299E-03F, /* 0x3A8FD607 */ 667 1.5106226783E-04F, /* 0x391E6678 */ 668 1.8214319134E-03F, /* 0x3AEEBD1D */ 669 2.6364589576E-04F, /* 0x398A39F4 */ 670 1.3519275235E-03F, /* 0x3AB13329 */ 671 1.1952003697E-03F, /* 0x3A9CA845 */ 672 1.7620950239E-03F, /* 0x3AE6F619 */ 673 1.1153318919E-03F, /* 0x3A923054 */ 674 1.2242280645E-03F, /* 0x3AA07647 */ 675 1.5220546629E-04F, /* 0x391F9958 */ 676 1.8224230735E-03F, /* 0x3AEEDE5F */ 677 3.9278529584E-04F, /* 0x39CDEEC0 */ 678 1.7403248930E-03F, /* 0x3AE41B9D */ 679 2.3711356334E-05F, /* 0x37C6E7C0 */ 680 1.1207590578E-03F, /* 0x3A92E66F */ 681 1.1440613307E-03F, /* 0x3A95F454 */ 682 1.1287408415E-04F}; /* 0x38ECB6D0 */ 683 684 /* 685 Step 1. Reduce the argument. 686 687 To perform argument reduction, we find the integer n such that 688 x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. 689 n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and 690 remainder by x - n*logbaseof2/32. The calculation of n is 691 straightforward whereas the computation of x - n*logbaseof2/32 692 must be carried out carefully. 693 logbaseof2/32 is so represented in two pieces that 694 (1) logbaseof2/32 is known to extra precision, (2) the product 695 of n and the leading piece is a model number and is hence 696 calculated without error, and (3) the subtraction of the value 697 obtained in (2) from x is a model number and is hence again 698 obtained without error. 699 */ 700 701 r = x * thirtytwo_by_logbaseof2; 702 /* Set n = nearest integer to r */ 703 /* This is faster on Hammer */ 704 if (r > 0) 705 n = (int)(r + 0.5F); 706 else 707 n = (int)(r - 0.5F); 708 709 r1 = x - n * logbaseof2_by_32_lead; 710 r2 = - n * logbaseof2_by_32_trail; 711 712 /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ 713 /* j = n % 32; 714 if (j < 0) j += 32; */ 715 j = n & 0x0000001f; 716 717 f1 = two_to_jby32_lead_table[j]; 718 f2 = two_to_jby32_trail_table[j]; 719 720 *m = (n - j) / 32; 721 722 /* Step 2. The following is the core approximation. We approximate 723 exp(r1+r2)-1 by a polynomial. */ 724 725 r1 *= logbase; r2 *= logbase; 726 727 r = r1 + r2; 728 q = r1 + (r2 + 729 r*r*( 5.00000000000000008883e-01F + 730 r*( 1.66666666665260878863e-01F ))); 731 732 /* Step 3. Function value reconstruction. 733 We now reconstruct the exponential of the input argument 734 so that exp(x) = 2**m * (z1 + z2). 735 The order of the computation below must be strictly observed. */ 736 737 *z1 = f1; 738 *z2 = f2 + ((f1 + f2) * q); 739 } 740 #endif /* SPLITEXPF */ 741 742 743 #if defined(USE_SCALEUPDOUBLE1024) 744 /* Scales up a double (normal or denormal) whose bit pattern is given 745 as ux by 2**1024. There are no checks that the input number is 746 scalable by that amount. */ 747 static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur) 748 { 749 unsigned long long uy; 750 double y; 751 752 if ((ux & EXPBITS_DP64) == 0) 753 { 754 /* ux is denormalised */ 755 PUT_BITS_DP64(ux | 0x4010000000000000, y); 756 if (ux & SIGNBIT_DP64) 757 y += 4.0; 758 else 759 y -= 4.0; 760 GET_BITS_DP64(y, uy); 761 } 762 else 763 /* ux is normal */ 764 uy = ux + 0x4000000000000000; 765 766 *ur = uy; 767 return; 768 } 769 770 #endif /* SCALEUPDOUBLE1024 */ 771 772 773 #if defined(USE_SCALEDOWNDOUBLE) 774 /* Scales down a double whose bit pattern is given as ux by 2**k. 775 There are no checks that the input number is scalable by that amount. */ 776 static inline void scaleDownDouble(unsigned long long ux, int k, 777 unsigned long long *ur) 778 { 779 unsigned long long uy, uk, ax, xsign; 780 int n, shift; 781 xsign = ux & SIGNBIT_DP64; 782 ax = ux & ~SIGNBIT_DP64; 783 n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k; 784 if (n > 0) 785 { 786 uk = (unsigned long long)n << EXPSHIFTBITS_DP64; 787 uy = (ax & ~EXPBITS_DP64) | uk; 788 } 789 else 790 { 791 uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000; 792 shift = (1 - n); 793 if (shift > MANTLENGTH_DP64 + 1) 794 /* Sigh. Shifting works mod 64 so be careful not to shift too much */ 795 uy = 0; 796 else 797 { 798 /* Make sure we round the result */ 799 uy >>= shift - 1; 800 uy = (uy >> 1) + (uy & 1); 801 } 802 } 803 *ur = uy | xsign; 804 } 805 806 #endif /* SCALEDOWNDOUBLE */ 807 808 809 #if defined(USE_SCALEUPFLOAT128) 810 /* Scales up a float (normal or denormal) whose bit pattern is given 811 as ux by 2**128. There are no checks that the input number is 812 scalable by that amount. */ 813 static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur) 814 { 815 unsigned int uy; 816 float y; 817 818 if ((ux & EXPBITS_SP32) == 0) 819 { 820 /* ux is denormalised */ 821 PUT_BITS_SP32(ux | 0x40800000, y); 822 /* Compensate for the implicit bit just added */ 823 if (ux & SIGNBIT_SP32) 824 y += 4.0F; 825 else 826 y -= 4.0F; 827 GET_BITS_SP32(y, uy); 828 } 829 else 830 /* ux is normal */ 831 uy = ux + 0x40000000; 832 *ur = uy; 833 } 834 #endif /* SCALEUPFLOAT128 */ 835 836 837 #if defined(USE_SCALEDOWNFLOAT) 838 /* Scales down a float whose bit pattern is given as ux by 2**k. 839 There are no checks that the input number is scalable by that amount. */ 840 static inline void scaleDownFloat(unsigned int ux, int k, 841 unsigned int *ur) 842 { 843 unsigned int uy, uk, ax, xsign; 844 int n, shift; 845 846 xsign = ux & SIGNBIT_SP32; 847 ax = ux & ~SIGNBIT_SP32; 848 n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k; 849 if (n > 0) 850 { 851 uk = (unsigned int)n << EXPSHIFTBITS_SP32; 852 uy = (ax & ~EXPBITS_SP32) | uk; 853 } 854 else 855 { 856 uy = (ax & ~EXPBITS_SP32) | 0x00800000; 857 shift = (1 - n); 858 if (shift > MANTLENGTH_SP32 + 1) 859 /* Sigh. Shifting works mod 32 so be careful not to shift too much */ 860 uy = 0; 861 else 862 { 863 /* Make sure we round the result */ 864 uy >>= shift - 1; 865 uy = (uy >> 1) + (uy & 1); 866 } 867 } 868 *ur = uy | xsign; 869 } 870 #endif /* SCALEDOWNFLOAT */ 871 872 873 #if defined(USE_SQRT_AMD_INLINE) 874 static inline double sqrt_amd_inline(double x) 875 { 876 /* 877 Computes the square root of x. 878 879 The calculation is carried out in three steps. 880 881 Step 1. Reduction. 882 The input argument is scaled to the interval [1, 4) by 883 computing 884 x = 2^e * y, where y in [1,4). 885 Furthermore y is decomposed as y = c + t where 886 c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. 887 888 Step 2. Approximation. 889 An approximation q = sqrt(1 + (t/c)) - 1 is obtained 890 from a basic series expansion using precomputed values 891 stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl. 892 893 Step 3. Reconstruction. 894 The value of sqrt(x) is reconstructed via 895 sqrt(x) = 2^(e/2) * sqrt(y) 896 = 2^(e/2) * sqrt(c) * sqrt(y/c) 897 = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) 898 = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] 899 */ 900 901 unsigned long long ux, ax, u; 902 double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; 903 int e, denorm = 0, index; 904 905 /* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain 906 leading and trailing parts respectively of precomputed 907 values of sqrt(j/32), for j = 32, 33, ..., 128. 908 rt_jby32_lead_table_dbl contains the first 21 bits of precision, 909 and rt_jby32_trail_table_dbl contains a further 53 bits precision. */ 910 911 static const double rt_jby32_lead_table_dbl[97] = { 912 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ 913 1.01550388336181640625e+00, /* 0x3ff03f8100000000 */ 914 1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */ 915 1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */ 916 1.06065940856933593750e+00, /* 0x3ff0f87600000000 */ 917 1.07528972625732421875e+00, /* 0x3ff1346300000000 */ 918 1.08972454071044921875e+00, /* 0x3ff16f8300000000 */ 919 1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */ 920 1.11803340911865234375e+00, /* 0x3ff1e37700000000 */ 921 1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */ 922 1.14564323425292968750e+00, /* 0x3ff2548e00000000 */ 923 1.15920162200927734375e+00, /* 0x3ff28c1700000000 */ 924 1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */ 925 1.18585395812988281250e+00, /* 0x3ff2f94200000000 */ 926 1.19895744323730468750e+00, /* 0x3ff32eee00000000 */ 927 1.21191978454589843750e+00, /* 0x3ff3640600000000 */ 928 1.22474479675292968750e+00, /* 0x3ff3988e00000000 */ 929 1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */ 930 1.25000000000000000000e+00, /* 0x3ff4000000000000 */ 931 1.26243782043457031250e+00, /* 0x3ff432f200000000 */ 932 1.27475452423095703125e+00, /* 0x3ff4656500000000 */ 933 1.28695297241210937500e+00, /* 0x3ff4975c00000000 */ 934 1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */ 935 1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */ 936 1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */ 937 1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */ 938 1.34629058837890625000e+00, /* 0x3ff58a6800000000 */ 939 1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */ 940 1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */ 941 1.38066959381103515625e+00, /* 0x3ff6173900000000 */ 942 1.39194107055664062500e+00, /* 0x3ff6456400000000 */ 943 1.40312099456787109375e+00, /* 0x3ff6732f00000000 */ 944 1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */ 945 1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */ 946 1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */ 947 1.44697952270507812500e+00, /* 0x3ff726d400000000 */ 948 1.45773792266845703125e+00, /* 0x3ff752e500000000 */ 949 1.46841716766357421875e+00, /* 0x3ff77ea300000000 */ 950 1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */ 951 1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */ 952 1.50000000000000000000e+00, /* 0x3ff8000000000000 */ 953 1.51038074493408203125e+00, /* 0x3ff82a8500000000 */ 954 1.52068996429443359375e+00, /* 0x3ff854bf00000000 */ 955 1.53093051910400390625e+00, /* 0x3ff87eb100000000 */ 956 1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */ 957 1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */ 958 1.56124877929687500000e+00, /* 0x3ff8fae000000000 */ 959 1.57122516632080078125e+00, /* 0x3ff923bd00000000 */ 960 1.58113861083984375000e+00, /* 0x3ff94c5800000000 */ 961 1.59099006652832031250e+00, /* 0x3ff974b200000000 */ 962 1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */ 963 1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */ 964 1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */ 965 1.62979984283447265625e+00, /* 0x3ffa13a900000000 */ 966 1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */ 967 1.64886283874511718750e+00, /* 0x3ffa61be00000000 */ 968 1.65831184387207031250e+00, /* 0x3ffa887200000000 */ 969 1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */ 970 1.67705059051513671875e+00, /* 0x3ffad53300000000 */ 971 1.68634128570556640625e+00, /* 0x3ffafb4100000000 */ 972 1.69558238983154296875e+00, /* 0x3ffb211b00000000 */ 973 1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */ 974 1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */ 975 1.72300529479980468750e+00, /* 0x3ffb916e00000000 */ 976 1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */ 977 1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */ 978 1.75000000000000000000e+00, /* 0x3ffc000000000000 */ 979 1.75890541076660156250e+00, /* 0x3ffc247a00000000 */ 980 1.76776695251464843750e+00, /* 0x3ffc48c600000000 */ 981 1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */ 982 1.78535652160644531250e+00, /* 0x3ffc90d200000000 */ 983 1.79408740997314453125e+00, /* 0x3ffcb49500000000 */ 984 1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */ 985 1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */ 986 1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */ 987 1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */ 988 1.83711719512939453125e+00, /* 0x3ffd64d500000000 */ 989 1.84560203552246093750e+00, /* 0x3ffd879600000000 */ 990 1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */ 991 1.86245727539062500000e+00, /* 0x3ffdcca000000000 */ 992 1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */ 993 1.87916183471679687500e+00, /* 0x3ffe110c00000000 */ 994 1.88745784759521484375e+00, /* 0x3ffe330700000000 */ 995 1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */ 996 1.90394306182861328125e+00, /* 0x3ffe768d00000000 */ 997 1.91213226318359375000e+00, /* 0x3ffe981800000000 */ 998 1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */ 999 1.92840576171875000000e+00, /* 0x3ffedac000000000 */ 1000 1.93649101257324218750e+00, /* 0x3ffefbde00000000 */ 1001 1.94454288482666015625e+00, /* 0x3fff1cd900000000 */ 1002 1.95256233215332031250e+00, /* 0x3fff3db200000000 */ 1003 1.96054744720458984375e+00, /* 0x3fff5e6700000000 */ 1004 1.96850109100341796875e+00, /* 0x3fff7efb00000000 */ 1005 1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */ 1006 1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */ 1007 1.99217128753662109375e+00, /* 0x3fffdfef00000000 */ 1008 2.00000000000000000000e+00}; /* 0x4000000000000000 */ 1009 1010 static const double rt_jby32_trail_table_dbl[97] = { 1011 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1012 9.17217678638807524014e-07, /* 0x3eaec6d70177881c */ 1013 3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */ 1014 2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */ 1015 7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */ 1016 9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */ 1017 1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */ 1018 5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */ 1019 5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */ 1020 4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */ 1021 6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */ 1022 6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */ 1023 3.32778123013641425828e-07, /* 0x3e965515a9b34850 */ 1024 1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */ 1025 4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */ 1026 1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */ 1027 7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */ 1028 5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */ 1029 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1030 2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */ 1031 3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */ 1032 7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */ 1033 1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */ 1034 6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */ 1035 6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */ 1036 6.20124838851440463844e-10, /* 0x3e054eacff2057dc */ 1037 6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */ 1038 3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */ 1039 7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */ 1040 5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */ 1041 2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */ 1042 5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */ 1043 3.81831102861301692797e-07, /* 0x3e999fcef32422cc */ 1044 6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */ 1045 6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */ 1046 9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */ 1047 5.10428680864685379950e-08, /* 0x3e6b674743636676 */ 1048 3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */ 1049 7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */ 1050 1.35158752025506517690e-07, /* 0x3e822404f4a103ee */ 1051 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1052 1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */ 1053 6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */ 1054 5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */ 1055 1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */ 1056 7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */ 1057 7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */ 1058 4.69926266058212796694e-07, /* 0x3e9f8945932d872e */ 1059 2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */ 1060 1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */ 1061 5.72297665296622053774e-07, /* 0x3ea333ffe005988d */ 1062 5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */ 1063 2.76225500213991506100e-07, /* 0x3e92898498f55f9e */ 1064 7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */ 1065 1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */ 1066 4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */ 1067 5.51305629612057435809e-07, /* 0x3ea27fade682d1de */ 1068 5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */ 1069 3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */ 1070 9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */ 1071 1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */ 1072 7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */ 1073 6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */ 1074 7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */ 1075 8.65552275731027456121e-07, /* 0x3ead0b09954e764b */ 1076 6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */ 1077 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1078 4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */ 1079 4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */ 1080 1.28874162718371367439e-07, /* 0x3e814c151f991b2e */ 1081 5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */ 1082 1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */ 1083 2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */ 1084 6.98925535290464831294e-07, /* 0x3ea773b981896751 */ 1085 1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */ 1086 5.43759351196479689657e-08, /* 0x3e6d315f22491900 */ 1087 1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */ 1088 8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */ 1089 8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */ 1090 7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */ 1091 6.48469316302918797451e-08, /* 0x3e71683f4920555d */ 1092 2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */ 1093 7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */ 1094 2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */ 1095 2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */ 1096 5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */ 1097 2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */ 1098 3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */ 1099 6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */ 1100 7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */ 1101 8.68233432860324345268e-08, /* 0x3e774e746878544d */ 1102 9.45465175398023087082e-07, /* 0x3eafb97be873a87d */ 1103 8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */ 1104 2.74055432394999316135e-07, /* 0x3e92643c89cda173 */ 1105 4.72129009349126213532e-07, /* 0x3e9faf1d57a4d56c */ 1106 8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */ 1107 0.00000000000000000000e+00}; /* 0x0000000000000000 */ 1108 1109 1110 /* Handle special arguments first */ 1111 1112 GET_BITS_DP64(x, ux); 1113 ax = ux & (~SIGNBIT_DP64); 1114 1115 if(ax >= 0x7ff0000000000000) 1116 { 1117 /* x is either NaN or infinity */ 1118 if (ux & MANTBITS_DP64) 1119 /* x is NaN */ 1120 return x + x; /* Raise invalid if it is a signalling NaN */ 1121 else if (ux & SIGNBIT_DP64) 1122 /* x is negative infinity */ 1123 return nan_with_flags(AMD_F_INVALID); 1124 else 1125 /* x is positive infinity */ 1126 return x; 1127 } 1128 else if (ux & SIGNBIT_DP64) 1129 { 1130 /* x is negative. */ 1131 if (ux == SIGNBIT_DP64) 1132 /* Handle negative zero first */ 1133 return x; 1134 else 1135 return nan_with_flags(AMD_F_INVALID); 1136 } 1137 else if (ux <= 0x000fffffffffffff) 1138 { 1139 /* x is denormalised or zero */ 1140 if (ux == 0) 1141 /* x is zero */ 1142 return x; 1143 else 1144 { 1145 /* x is denormalised; scale it up */ 1146 /* Normalize x by increasing the exponent by 60 1147 and subtracting a correction to account for the implicit 1148 bit. This replaces a slow denormalized 1149 multiplication by a fast normal subtraction. */ 1150 static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ 1151 denorm = 1; 1152 GET_BITS_DP64(x, ux); 1153 PUT_BITS_DP64(ux | 0x03d0000000000000, x); 1154 x -= corr; 1155 GET_BITS_DP64(x, ux); 1156 } 1157 } 1158 1159 /* Main algorithm */ 1160 1161 /* 1162 Find y and e such that x = 2^e * y, where y in [1,4). 1163 This is done using an in-lined variant of splitDouble, 1164 which also ensures that e is even. 1165 */ 1166 y = x; 1167 ux &= EXPBITS_DP64; 1168 ux >>= EXPSHIFTBITS_DP64; 1169 if (ux & 1) 1170 { 1171 GET_BITS_DP64(y, u); 1172 u &= (SIGNBIT_DP64 | MANTBITS_DP64); 1173 u |= ONEEXPBITS_DP64; 1174 PUT_BITS_DP64(u, y); 1175 e = ux - EXPBIAS_DP64; 1176 } 1177 else 1178 { 1179 GET_BITS_DP64(y, u); 1180 u &= (SIGNBIT_DP64 | MANTBITS_DP64); 1181 u |= TWOEXPBITS_DP64; 1182 PUT_BITS_DP64(u, y); 1183 e = ux - EXPBIAS_DP64 - 1; 1184 } 1185 1186 1187 /* Find the index of the sub-interval of [1,4) in which y lies. */ 1188 1189 index = (int)(32.0*y+0.5); 1190 1191 /* Look up the table values and compute c and r = c/t */ 1192 1193 rtc_lead = rt_jby32_lead_table_dbl[index-32]; 1194 rtc_trail = rt_jby32_trail_table_dbl[index-32]; 1195 c = 0.03125*index; 1196 r = (y - c)/c; 1197 1198 /* 1199 Find q = sqrt(1+r) - 1. 1200 From one step of Newton on (q+1)^2 = 1+r 1201 */ 1202 1203 p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01)); 1204 twop = p + p; 1205 q = p - (p*p + (twop - r))/(twop + 2.0); 1206 1207 /* Reconstruction */ 1208 1209 rtc = rtc_lead + rtc_trail; 1210 e >>= 1; /* e = e/2 */ 1211 z = rtc_lead + (rtc*q+rtc_trail); 1212 1213 if (denorm) 1214 { 1215 /* Scale by 2**(e-30) */ 1216 PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); 1217 z *= r; 1218 } 1219 else 1220 { 1221 /* Scale by 2**e */ 1222 PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); 1223 z *= r; 1224 } 1225 1226 return z; 1227 1228 } 1229 #endif /* SQRT_AMD_INLINE */ 1230 1231 #if defined(USE_SQRTF_AMD_INLINE) 1232 1233 static inline float sqrtf_amd_inline(float x) 1234 { 1235 /* 1236 Computes the square root of x. 1237 1238 The calculation is carried out in three steps. 1239 1240 Step 1. Reduction. 1241 The input argument is scaled to the interval [1, 4) by 1242 computing 1243 x = 2^e * y, where y in [1,4). 1244 Furthermore y is decomposed as y = c + t where 1245 c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. 1246 1247 Step 2. Approximation. 1248 An approximation q = sqrt(1 + (t/c)) - 1 is obtained 1249 from a basic series expansion using precomputed values 1250 stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float. 1251 1252 Step 3. Reconstruction. 1253 The value of sqrt(x) is reconstructed via 1254 sqrt(x) = 2^(e/2) * sqrt(y) 1255 = 2^(e/2) * sqrt(c) * sqrt(y/c) 1256 = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) 1257 = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] 1258 */ 1259 1260 unsigned int ux, ax, u; 1261 float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; 1262 int e, denorm = 0, index; 1263 1264 /* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain 1265 leading and trailing parts respectively of precomputed 1266 values of sqrt(j/32), for j = 32, 33, ..., 128. 1267 rt_jby32_lead_table_float contains the first 13 bits of precision, 1268 and rt_jby32_trail_table_float contains a further 24 bits precision. */ 1269 1270 static const float rt_jby32_lead_table_float[97] = { 1271 1.00000000000000000000e+00F, /* 0x3f800000 */ 1272 1.01538085937500000000e+00F, /* 0x3f81f800 */ 1273 1.03076171875000000000e+00F, /* 0x3f83f000 */ 1274 1.04565429687500000000e+00F, /* 0x3f85d800 */ 1275 1.06054687500000000000e+00F, /* 0x3f87c000 */ 1276 1.07519531250000000000e+00F, /* 0x3f89a000 */ 1277 1.08959960937500000000e+00F, /* 0x3f8b7800 */ 1278 1.10375976562500000000e+00F, /* 0x3f8d4800 */ 1279 1.11791992187500000000e+00F, /* 0x3f8f1800 */ 1280 1.13183593750000000000e+00F, /* 0x3f90e000 */ 1281 1.14550781250000000000e+00F, /* 0x3f92a000 */ 1282 1.15917968750000000000e+00F, /* 0x3f946000 */ 1283 1.17236328125000000000e+00F, /* 0x3f961000 */ 1284 1.18579101562500000000e+00F, /* 0x3f97c800 */ 1285 1.19873046875000000000e+00F, /* 0x3f997000 */ 1286 1.21191406250000000000e+00F, /* 0x3f9b2000 */ 1287 1.22460937500000000000e+00F, /* 0x3f9cc000 */ 1288 1.23730468750000000000e+00F, /* 0x3f9e6000 */ 1289 1.25000000000000000000e+00F, /* 0x3fa00000 */ 1290 1.26220703125000000000e+00F, /* 0x3fa19000 */ 1291 1.27465820312500000000e+00F, /* 0x3fa32800 */ 1292 1.28686523437500000000e+00F, /* 0x3fa4b800 */ 1293 1.29882812500000000000e+00F, /* 0x3fa64000 */ 1294 1.31079101562500000000e+00F, /* 0x3fa7c800 */ 1295 1.32275390625000000000e+00F, /* 0x3fa95000 */ 1296 1.33447265625000000000e+00F, /* 0x3faad000 */ 1297 1.34619140625000000000e+00F, /* 0x3fac5000 */ 1298 1.35766601562500000000e+00F, /* 0x3fadc800 */ 1299 1.36914062500000000000e+00F, /* 0x3faf4000 */ 1300 1.38061523437500000000e+00F, /* 0x3fb0b800 */ 1301 1.39184570312500000000e+00F, /* 0x3fb22800 */ 1302 1.40307617187500000000e+00F, /* 0x3fb39800 */ 1303 1.41406250000000000000e+00F, /* 0x3fb50000 */ 1304 1.42504882812500000000e+00F, /* 0x3fb66800 */ 1305 1.43603515625000000000e+00F, /* 0x3fb7d000 */ 1306 1.44677734375000000000e+00F, /* 0x3fb93000 */ 1307 1.45751953125000000000e+00F, /* 0x3fba9000 */ 1308 1.46826171875000000000e+00F, /* 0x3fbbf000 */ 1309 1.47900390625000000000e+00F, /* 0x3fbd5000 */ 1310 1.48950195312500000000e+00F, /* 0x3fbea800 */ 1311 1.50000000000000000000e+00F, /* 0x3fc00000 */ 1312 1.51025390625000000000e+00F, /* 0x3fc15000 */ 1313 1.52050781250000000000e+00F, /* 0x3fc2a000 */ 1314 1.53076171875000000000e+00F, /* 0x3fc3f000 */ 1315 1.54101562500000000000e+00F, /* 0x3fc54000 */ 1316 1.55102539062500000000e+00F, /* 0x3fc68800 */ 1317 1.56103515625000000000e+00F, /* 0x3fc7d000 */ 1318 1.57104492187500000000e+00F, /* 0x3fc91800 */ 1319 1.58105468750000000000e+00F, /* 0x3fca6000 */ 1320 1.59082031250000000000e+00F, /* 0x3fcba000 */ 1321 1.60058593750000000000e+00F, /* 0x3fcce000 */ 1322 1.61035156250000000000e+00F, /* 0x3fce2000 */ 1323 1.62011718750000000000e+00F, /* 0x3fcf6000 */ 1324 1.62963867187500000000e+00F, /* 0x3fd09800 */ 1325 1.63916015625000000000e+00F, /* 0x3fd1d000 */ 1326 1.64868164062500000000e+00F, /* 0x3fd30800 */ 1327 1.65820312500000000000e+00F, /* 0x3fd44000 */ 1328 1.66748046875000000000e+00F, /* 0x3fd57000 */ 1329 1.67700195312500000000e+00F, /* 0x3fd6a800 */ 1330 1.68627929687500000000e+00F, /* 0x3fd7d800 */ 1331 1.69555664062500000000e+00F, /* 0x3fd90800 */ 1332 1.70458984375000000000e+00F, /* 0x3fda3000 */ 1333 1.71386718750000000000e+00F, /* 0x3fdb6000 */ 1334 1.72290039062500000000e+00F, /* 0x3fdc8800 */ 1335 1.73193359375000000000e+00F, /* 0x3fddb000 */ 1336 1.74096679687500000000e+00F, /* 0x3fded800 */ 1337 1.75000000000000000000e+00F, /* 0x3fe00000 */ 1338 1.75878906250000000000e+00F, /* 0x3fe12000 */ 1339 1.76757812500000000000e+00F, /* 0x3fe24000 */ 1340 1.77636718750000000000e+00F, /* 0x3fe36000 */ 1341 1.78515625000000000000e+00F, /* 0x3fe48000 */ 1342 1.79394531250000000000e+00F, /* 0x3fe5a000 */ 1343 1.80273437500000000000e+00F, /* 0x3fe6c000 */ 1344 1.81127929687500000000e+00F, /* 0x3fe7d800 */ 1345 1.81982421875000000000e+00F, /* 0x3fe8f000 */ 1346 1.82836914062500000000e+00F, /* 0x3fea0800 */ 1347 1.83691406250000000000e+00F, /* 0x3feb2000 */ 1348 1.84545898437500000000e+00F, /* 0x3fec3800 */ 1349 1.85400390625000000000e+00F, /* 0x3fed5000 */ 1350 1.86230468750000000000e+00F, /* 0x3fee6000 */ 1351 1.87060546875000000000e+00F, /* 0x3fef7000 */ 1352 1.87915039062500000000e+00F, /* 0x3ff08800 */ 1353 1.88745117187500000000e+00F, /* 0x3ff19800 */ 1354 1.89550781250000000000e+00F, /* 0x3ff2a000 */ 1355 1.90380859375000000000e+00F, /* 0x3ff3b000 */ 1356 1.91210937500000000000e+00F, /* 0x3ff4c000 */ 1357 1.92016601562500000000e+00F, /* 0x3ff5c800 */ 1358 1.92822265625000000000e+00F, /* 0x3ff6d000 */ 1359 1.93627929687500000000e+00F, /* 0x3ff7d800 */ 1360 1.94433593750000000000e+00F, /* 0x3ff8e000 */ 1361 1.95239257812500000000e+00F, /* 0x3ff9e800 */ 1362 1.96044921875000000000e+00F, /* 0x3ffaf000 */ 1363 1.96826171875000000000e+00F, /* 0x3ffbf000 */ 1364 1.97631835937500000000e+00F, /* 0x3ffcf800 */ 1365 1.98413085937500000000e+00F, /* 0x3ffdf800 */ 1366 1.99194335937500000000e+00F, /* 0x3ffef800 */ 1367 2.00000000000000000000e+00F}; /* 0x40000000 */ 1368 1369 static const float rt_jby32_trail_table_float[97] = { 1370 0.00000000000000000000e+00F, /* 0x00000000 */ 1371 1.23941208585165441036e-04F, /* 0x3901f637 */ 1372 1.46876545841223560274e-05F, /* 0x37766aff */ 1373 1.70736297150142490864e-04F, /* 0x393307ad */ 1374 1.13296780909877270460e-04F, /* 0x38ed99bf */ 1375 9.53458802541717886925e-05F, /* 0x38c7f46e */ 1376 1.25126505736261606216e-04F, /* 0x39033464 */ 1377 2.10342666832730174065e-04F, /* 0x395c8f6e */ 1378 1.14066875539720058441e-04F, /* 0x38ef3730 */ 1379 8.72047676239162683487e-05F, /* 0x38b6e1b4 */ 1380 1.36111237225122749805e-04F, /* 0x390eb915 */ 1381 2.26244374061934649944e-05F, /* 0x37bdc99c */ 1382 2.40658700931817293167e-04F, /* 0x397c5954 */ 1383 6.31069415248930454254e-05F, /* 0x38845848 */ 1384 2.27412077947519719601e-04F, /* 0x396e7577 */ 1385 5.90185391047270968556e-06F, /* 0x36c6088a */ 1386 1.35496389702893793583e-04F, /* 0x390e1409 */ 1387 1.32179571664892137051e-04F, /* 0x390a99af */ 1388 0.00000000000000000000e+00F, /* 0x00000000 */ 1389 2.31086043640971183777e-04F, /* 0x39724fb0 */ 1390 9.66752704698592424393e-05F, /* 0x38cabe24 */ 1391 8.85332483449019491673e-05F, /* 0x38b9aaed */ 1392 2.09980673389509320259e-04F, /* 0x395c2e42 */ 1393 2.20044588786549866199e-04F, /* 0x3966bbc5 */ 1394 1.21749282698146998882e-04F, /* 0x38ff53a6 */ 1395 1.62125259521417319775e-04F, /* 0x392a002b */ 1396 9.97955357888713479042e-05F, /* 0x38d14952 */ 1397 1.81545779923908412457e-04F, /* 0x393e5d53 */ 1398 1.65768768056295812130e-04F, /* 0x392dd237 */ 1399 5.48927710042335093021e-05F, /* 0x38663caa */ 1400 9.53875860432162880898e-05F, /* 0x38c80ad2 */ 1401 4.53481625299900770187e-05F, /* 0x383e3438 */ 1402 1.51062369695864617825e-04F, /* 0x391e667f */ 1403 1.70453247847035527229e-04F, /* 0x3932bbb2 */ 1404 1.05505387182347476482e-04F, /* 0x38dd42c6 */ 1405 2.02269104192964732647e-04F, /* 0x39541833 */ 1406 2.18442466575652360916e-04F, /* 0x39650db4 */ 1407 1.55796806211583316326e-04F, /* 0x39235d63 */ 1408 1.60395247803535312414e-05F, /* 0x37868c9e */ 1409 4.49578510597348213196e-05F, /* 0x383c9120 */ 1410 0.00000000000000000000e+00F, /* 0x00000000 */ 1411 1.26840444863773882389e-04F, /* 0x39050079 */ 1412 1.82820076588541269302e-04F, /* 0x393fb364 */ 1413 1.69370483490638434887e-04F, /* 0x3931990b */ 1414 8.78757418831810355186e-05F, /* 0x38b849ee */ 1415 1.83815121999941766262e-04F, /* 0x3940be7f */ 1416 2.14343352126888930798e-04F, /* 0x3960c15b */ 1417 1.80714370799250900745e-04F, /* 0x393d7e25 */ 1418 8.41425862745381891727e-05F, /* 0x38b075b5 */ 1419 1.69945167726837098598e-04F, /* 0x3932334f */ 1420 1.95121858268976211548e-04F, /* 0x394c99a0 */ 1421 1.60778334247879683971e-04F, /* 0x3928969b */ 1422 6.79871009197086095810e-05F, /* 0x388e944c */ 1423 1.61929419846273958683e-04F, /* 0x3929cb99 */ 1424 1.99474830878898501396e-04F, /* 0x39512a1e */ 1425 1.81604162207804620266e-04F, /* 0x393e6cff */ 1426 1.09270178654696792364e-04F, /* 0x38e527fb */ 1427 2.27539261686615645885e-04F, /* 0x396e979b */ 1428 4.90300008095800876617e-05F, /* 0x384da590 */ 1429 6.28985289949923753738e-05F, /* 0x3883e864 */ 1430 2.58551553997676819563e-05F, /* 0x37d8e386 */ 1431 1.82868374395184218884e-04F, /* 0x393fc05b */ 1432 4.64625991298817098141e-05F, /* 0x3842e0d6 */ 1433 1.05703387816902250051e-04F, /* 0x38ddad13 */ 1434 1.17213814519345760345e-04F, /* 0x38f5d0b0 */ 1435 8.17377731436863541603e-05F, /* 0x38ab6aa2 */ 1436 0.00000000000000000000e+00F, /* 0x00000000 */ 1437 1.16847433673683553934e-04F, /* 0x38f50bfd */ 1438 1.88827965757809579372e-04F, /* 0x3946001f */ 1439 2.16612941585481166840e-04F, /* 0x39632298 */ 1440 2.00857131858356297016e-04F, /* 0x39529d2d */ 1441 1.42199307447299361229e-04F, /* 0x39151b56 */ 1442 4.12627305195201188326e-05F, /* 0x382d1185 */ 1443 1.42796401632949709892e-04F, /* 0x3915bb9e */ 1444 2.03253570361994206905e-04F, /* 0x39552077 */ 1445 2.23214170546270906925e-04F, /* 0x396a0e99 */ 1446 2.03244591830298304558e-04F, /* 0x39551e0e */ 1447 1.43898156238719820976e-04F, /* 0x3916e35e */ 1448 4.57155256299301981926e-05F, /* 0x383fbeac */ 1449 1.53365719597786664963e-04F, /* 0x3920d0cc */ 1450 2.23224633373320102692e-04F, /* 0x396a1168 */ 1451 1.16566716314991936088e-05F, /* 0x37439106 */ 1452 7.43694272387074306607e-06F, /* 0x36f98ada */ 1453 2.11048507480882108212e-04F, /* 0x395d4ce7 */ 1454 1.34682719362899661064e-04F, /* 0x390d399e */ 1455 2.29425968427676707506e-05F, /* 0x37c074da */ 1456 1.20421340398024767637e-04F, /* 0x38fc8ab7 */ 1457 1.83421318070031702518e-04F, /* 0x394054c9 */ 1458 2.12376224226318299770e-04F, /* 0x395eb14f */ 1459 2.07710763788782060146e-04F, /* 0x3959ccef */ 1460 1.69840845046564936638e-04F, /* 0x3932174e */ 1461 9.91739216260612010956e-05F, /* 0x38cffb98 */ 1462 2.40249748458154499531e-04F, /* 0x397beb8d */ 1463 1.05178231024183332920e-04F, /* 0x38dc9322 */ 1464 1.82623916771262884140e-04F, /* 0x393f7ebc */ 1465 2.28821940254420042038e-04F, /* 0x396fefec */ 1466 0.00000000000000000000e+00F}; /* 0x00000000 */ 1467 1468 1469 /* Handle special arguments first */ 1470 1471 GET_BITS_SP32(x, ux); 1472 ax = ux & (~SIGNBIT_SP32); 1473 1474 if(ax >= 0x7f800000) 1475 { 1476 /* x is either NaN or infinity */ 1477 if (ux & MANTBITS_SP32) 1478 /* x is NaN */ 1479 return x + x; /* Raise invalid if it is a signalling NaN */ 1480 else if (ux & SIGNBIT_SP32) 1481 return nanf_with_flags(AMD_F_INVALID); 1482 else 1483 /* x is positive infinity */ 1484 return x; 1485 } 1486 else if (ux & SIGNBIT_SP32) 1487 { 1488 /* x is negative. */ 1489 if (x == 0.0F) 1490 /* Handle negative zero first */ 1491 return x; 1492 else 1493 return nanf_with_flags(AMD_F_INVALID); 1494 } 1495 else if (ux <= 0x007fffff) 1496 { 1497 /* x is denormalised or zero */ 1498 if (ux == 0) 1499 /* x is zero */ 1500 return x; 1501 else 1502 { 1503 /* x is denormalised; scale it up */ 1504 /* Normalize x by increasing the exponent by 26 1505 and subtracting a correction to account for the implicit 1506 bit. This replaces a slow denormalized 1507 multiplication by a fast normal subtraction. */ 1508 static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */ 1509 denorm = 1; 1510 GET_BITS_SP32(x, ux); 1511 PUT_BITS_SP32(ux | 0x0d800000, x); 1512 x -= corr; 1513 GET_BITS_SP32(x, ux); 1514 } 1515 } 1516 1517 /* Main algorithm */ 1518 1519 /* 1520 Find y and e such that x = 2^e * y, where y in [1,4). 1521 This is done using an in-lined variant of splitFloat, 1522 which also ensures that e is even. 1523 */ 1524 y = x; 1525 ux &= EXPBITS_SP32; 1526 ux >>= EXPSHIFTBITS_SP32; 1527 if (ux & 1) 1528 { 1529 GET_BITS_SP32(y, u); 1530 u &= (SIGNBIT_SP32 | MANTBITS_SP32); 1531 u |= ONEEXPBITS_SP32; 1532 PUT_BITS_SP32(u, y); 1533 e = ux - EXPBIAS_SP32; 1534 } 1535 else 1536 { 1537 GET_BITS_SP32(y, u); 1538 u &= (SIGNBIT_SP32 | MANTBITS_SP32); 1539 u |= TWOEXPBITS_SP32; 1540 PUT_BITS_SP32(u, y); 1541 e = ux - EXPBIAS_SP32 - 1; 1542 } 1543 1544 /* Find the index of the sub-interval of [1,4) in which y lies. */ 1545 1546 index = (int)(32.0F*y+0.5); 1547 1548 /* Look up the table values and compute c and r = c/t */ 1549 1550 rtc_lead = rt_jby32_lead_table_float[index-32]; 1551 rtc_trail = rt_jby32_trail_table_float[index-32]; 1552 c = 0.03125F*index; 1553 r = (y - c)/c; 1554 1555 /* 1556 Find q = sqrt(1+r) - 1. 1557 From one step of Newton on (q+1)^2 = 1+r 1558 */ 1559 1560 p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F)); 1561 twop = p + p; 1562 q = p - (p*p + (twop - r))/(twop + 2.0); 1563 1564 /* Reconstruction */ 1565 1566 rtc = rtc_lead + rtc_trail; 1567 e >>= 1; /* e = e/2 */ 1568 z = rtc_lead + (rtc*q+rtc_trail); 1569 1570 if (denorm) 1571 { 1572 /* Scale by 2**(e-13) */ 1573 PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); 1574 z *= r; 1575 } 1576 else 1577 { 1578 /* Scale by 2**e */ 1579 PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); 1580 z *= r; 1581 } 1582 1583 return z; 1584 1585 } 1586 #endif /* SQRTF_AMD_INLINE */ 1587 1588 #ifdef USE_LOG_KERNEL_AMD 1589 static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2) 1590 { 1591 1592 int expadjust; 1593 double r, z1, z2, correction, f, f1, f2, q, u, v, poly; 1594 int index; 1595 1596 /* 1597 Computes natural log(x). Algorithm based on: 1598 Ping-Tak Peter Tang 1599 "Table-driven implementation of the logarithm function in IEEE 1600 floating-point arithmetic" 1601 ACM Transactions on Mathematical Software (TOMS) 1602 Volume 16, Issue 4 (December 1990) 1603 */ 1604 1605 /* Arrays ln_lead_table and ln_tail_table contain 1606 leading and trailing parts respectively of precomputed 1607 values of natural log(1+i/64), for i = 0, 1, ..., 64. 1608 ln_lead_table contains the first 24 bits of precision, 1609 and ln_tail_table contains a further 53 bits precision. */ 1610 1611 static const double ln_lead_table[65] = { 1612 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1613 1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */ 1614 3.07716131210327148438e-02, /* 0x3f9f829800000000 */ 1615 4.58095073699951171875e-02, /* 0x3fa7745800000000 */ 1616 6.06245994567871093750e-02, /* 0x3faf0a3000000000 */ 1617 7.52233862876892089844e-02, /* 0x3fb341d700000000 */ 1618 8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */ 1619 1.03796780109405517578e-01, /* 0x3fba926d00000000 */ 1620 1.17783010005950927734e-01, /* 0x3fbe270700000000 */ 1621 1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */ 1622 1.45181953907012939453e-01, /* 0x3fc2955280000000 */ 1623 1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */ 1624 1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */ 1625 1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */ 1626 1.97825729846954345703e-01, /* 0x3fc9525a80000000 */ 1627 2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */ 1628 2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */ 1629 2.35566020011901855469e-01, /* 0x3fce270700000000 */ 1630 2.47836112976074218750e-01, /* 0x3fcfb91800000000 */ 1631 2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */ 1632 2.71933674812316894531e-01, /* 0x3fd1675c80000000 */ 1633 2.83768117427825927734e-01, /* 0x3fd22941c0000000 */ 1634 2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */ 1635 3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */ 1636 3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */ 1637 3.29753279685974121094e-01, /* 0x3fd51aad80000000 */ 1638 3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */ 1639 3.51976394653320312500e-01, /* 0x3fd686c800000000 */ 1640 3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */ 1641 3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */ 1642 3.84411692619323730469e-01, /* 0x3fd89a3380000000 */ 1643 3.94993782043457031250e-01, /* 0x3fd9479400000000 */ 1644 4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */ 1645 4.15827870368957519531e-01, /* 0x3fda9cec80000000 */ 1646 4.26084339618682861328e-01, /* 0x3fdb44f740000000 */ 1647 4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */ 1648 4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */ 1649 4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */ 1650 4.66089725494384765625e-01, /* 0x3fddd46a00000000 */ 1651 4.75845873355865478516e-01, /* 0x3fde744240000000 */ 1652 4.85507786273956298828e-01, /* 0x3fdf128f40000000 */ 1653 4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */ 1654 5.04556000232696533203e-01, /* 0x3fe02552a0000000 */ 1655 5.13945698738098144531e-01, /* 0x3fe0723e40000000 */ 1656 5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */ 1657 5.32464742660522460938e-01, /* 0x3fe109f380000000 */ 1658 5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */ 1659 5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */ 1660 5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */ 1661 5.68504691123962402344e-01, /* 0x3fe23130c0000000 */ 1662 5.77315330505371093750e-01, /* 0x3fe2795e00000000 */ 1663 5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */ 1664 5.94707071781158447266e-01, /* 0x3fe307d720000000 */ 1665 6.03290796279907226562e-01, /* 0x3fe34e2880000000 */ 1666 6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */ 1667 6.20240390300750732422e-01, /* 0x3fe3d90260000000 */ 1668 6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */ 1669 6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */ 1670 6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */ 1671 6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */ 1672 6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */ 1673 6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */ 1674 6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */ 1675 6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */ 1676 6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */ 1677 1678 static const double ln_tail_table[65] = { 1679 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1680 5.15092497094772879206e-09, /* 0x3e361f807c79f3db */ 1681 4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */ 1682 2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */ 1683 2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */ 1684 3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */ 1685 3.23392843005887000414e-08, /* 0x3e615cad69737c93 */ 1686 1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */ 1687 2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */ 1688 5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */ 1689 5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */ 1690 5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */ 1691 5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */ 1692 9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */ 1693 1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */ 1694 3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */ 1695 3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */ 1696 5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */ 1697 5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */ 1698 3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */ 1699 4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */ 1700 5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */ 1701 5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */ 1702 1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */ 1703 1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */ 1704 6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */ 1705 5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */ 1706 2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */ 1707 5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */ 1708 5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */ 1709 6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */ 1710 2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */ 1711 4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */ 1712 2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */ 1713 5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */ 1714 2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */ 1715 7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */ 1716 5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */ 1717 4.43021445893361960146e-09, /* 0x3e33071282fb989b */ 1718 3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */ 1719 2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */ 1720 1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */ 1721 1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */ 1722 5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */ 1723 7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */ 1724 5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */ 1725 3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */ 1726 3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */ 1727 5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */ 1728 4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */ 1729 3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */ 1730 2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */ 1731 3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */ 1732 5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */ 1733 3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */ 1734 1.94511067964296180547e-08, /* 0x3e54e2adf548084c */ 1735 1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */ 1736 3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */ 1737 5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */ 1738 3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */ 1739 1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */ 1740 4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */ 1741 2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */ 1742 1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */ 1743 5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */ 1744 1745 /* Approximating polynomial coefficients for x near 1.0 */ 1746 static const double 1747 ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */ 1748 ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */ 1749 ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */ 1750 ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */ 1751 1752 /* Approximating polynomial coefficients for other x */ 1753 static const double 1754 cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */ 1755 cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */ 1756 cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */ 1757 1758 static const unsigned long long 1759 log_thresh1 = 0x3fee0faa00000000, 1760 log_thresh2 = 0x3ff1082c00000000; 1761 1762 /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000 1763 log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */ 1764 if (ux >= log_thresh1 && ux <= log_thresh2) 1765 { 1766 /* Arguments close to 1.0 are handled separately to maintain 1767 accuracy. 1768 1769 The approximation in this region exploits the identity 1770 log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where 1771 u = 2r / (2+r). 1772 Note that the right hand side has an odd Taylor series expansion 1773 which converges much faster than the Taylor series expansion of 1774 log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by 1775 u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1). 1776 1777 One subtlety is that since u cannot be calculated from 1778 r exactly, the rounding error in the first u should be 1779 avoided if possible. To accomplish this, we observe that 1780 u = r - r*r/(2+r). 1781 Since x (=1+r) is the input argument, and thus presumed exact, 1782 the formula above approximates u accurately because 1783 u = r - correction, 1784 and the magnitude of "correction" (of the order of r*r) 1785 is small. 1786 With these observations, we will approximate log( 1 + r ) by 1787 r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ). 1788 1789 We approximate log(1+r) by an odd polynomial in u, where 1790 u = 2r/(2+r) = r - r*r/(2+r). 1791 */ 1792 r = x - 1.0; 1793 u = r / (2.0 + r); 1794 correction = r * u; 1795 u = u + u; 1796 v = u * u; 1797 z1 = r; 1798 z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); 1799 *r1 = z1; 1800 *r2 = z2; 1801 *xexp = 0; 1802 } 1803 else 1804 { 1805 /* 1806 First, we decompose the argument x to the form 1807 x = 2**M * (F1 + F2), 1808 where 1 <= F1+F2 < 2, M has the value of an integer, 1809 F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128. 1810 1811 Second, we approximate log( 1 + F2/F1 ) by an odd polynomial 1812 in U, where U = 2 F2 / (2 F2 + F1). 1813 Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ). 1814 The core approximation calculates 1815 Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1. 1816 Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ), 1817 thus, Poly = 2 arctanh( U/2 ) / U - 1. 1818 1819 It is not hard to see that 1820 log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ). 1821 Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1). 1822 The values of log(F1) are calculated beforehand and stored 1823 in the program. 1824 */ 1825 1826 f = x; 1827 if (ux < IMPBIT_DP64) 1828 { 1829 /* The input argument x is denormalized */ 1830 /* Normalize f by increasing the exponent by 60 1831 and subtracting a correction to account for the implicit 1832 bit. This replaces a slow denormalized 1833 multiplication by a fast normal subtraction. */ 1834 static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ 1835 GET_BITS_DP64(f, ux); 1836 ux |= 0x03d0000000000000; 1837 PUT_BITS_DP64(ux, f); 1838 f -= corr; 1839 GET_BITS_DP64(f, ux); 1840 expadjust = 60; 1841 } 1842 else 1843 expadjust = 0; 1844 1845 /* Store the exponent of x in xexp and put 1846 f into the range [0.5,1) */ 1847 *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust; 1848 PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f); 1849 1850 /* Now x = 2**xexp * f, 1/2 <= f < 1. */ 1851 1852 /* Set index to be the nearest integer to 128*f */ 1853 r = 128.0 * f; 1854 index = (int)(r + 0.5); 1855 1856 z1 = ln_lead_table[index-64]; 1857 q = ln_tail_table[index-64]; 1858 f1 = index * 0.0078125; /* 0.0078125 = 1/128 */ 1859 f2 = f - f1; 1860 /* At this point, x = 2**xexp * ( f1 + f2 ) where 1861 f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */ 1862 1863 /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */ 1864 /* u = f2 / (f1 + 0.5 * f2); */ 1865 u = f2 / (f1 + 0.5 * f2); 1866 1867 /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1). 1868 The core approximation calculates 1869 poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ 1870 v = u * u; 1871 poly = (v * (cb_1 + v * (cb_2 + v * cb_3))); 1872 z2 = q + (u + u * poly); 1873 *r1 = z1; 1874 *r2 = z2; 1875 } 1876 return; 1877 } 1878 #endif /* USE_LOG_KERNEL_AMD */ 1879 1880 #if defined(USE_REMAINDER_PIBY2F_INLINE) 1881 /* Define this to get debugging print statements activated */ 1882 #define DEBUGGING_PRINT 1883 #undef DEBUGGING_PRINT 1884 1885 1886 #ifdef DEBUGGING_PRINT 1887 #include <stdio.h> 1888 char *d2b(long long d, int bitsper, int point) 1889 { 1890 static char buff[200]; 1891 int i, j; 1892 j = bitsper; 1893 if (point >= 0 && point <= bitsper) 1894 j++; 1895 buff[j] = '\0'; 1896 for (i = bitsper - 1; i >= 0; i--) 1897 { 1898 j--; 1899 if (d % 2 == 1) 1900 buff[j] = '1'; 1901 else 1902 buff[j] = '0'; 1903 if (i == point) 1904 { 1905 j--; 1906 buff[j] = '.'; 1907 } 1908 d /= 2; 1909 } 1910 return buff; 1911 } 1912 #endif 1913 1914 /* Given positive argument x, reduce it to the range [-pi/4,pi/4] using 1915 extra precision, and return the result in r. 1916 Return value "region" tells how many lots of pi/2 were subtracted 1917 from x to put it in the range [-pi/4,pi/4], mod 4. */ 1918 static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region) 1919 { 1920 1921 /* This method simulates multi-precision floating-point 1922 arithmetic and is accurate for all 1 <= x < infinity */ 1923 #if 0 1924 const int bitsper = 36; 1925 #else 1926 #define bitsper 36 1927 #endif 1928 unsigned long long res[10]; 1929 unsigned long long u, carry, mask, mant, nextbits; 1930 int first, last, i, rexp, xexp, resexp, ltb, determ, bc; 1931 double dx; 1932 static const double 1933 piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ 1934 static unsigned long long pibits[] = 1935 { 1936 0LL, 1937 5215LL, 13000023176LL, 11362338026LL, 67174558139LL, 1938 34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL, 1939 19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL 1940 }; 1941 1942 xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); 1943 ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29; 1944 1945 1946 /* Now ux is the mantissa bit pattern of x as a long long integer */ 1947 mask = 1; 1948 mask = (mask << bitsper) - 1; 1949 1950 /* Set first and last to the positions of the first 1951 and last chunks of 2/pi that we need */ 1952 first = xexp / bitsper; 1953 resexp = xexp - first * bitsper; 1954 /* 120 is the theoretical maximum number of bits (actually 1955 115 for IEEE single precision) that we need to extract 1956 from the middle of 2/pi to compute the reduced argument 1957 accurately enough for our purposes */ 1958 last = first + 120 / bitsper; 1959 1960 #ifdef DEBUGGING_PRINT 1961 printf("first = %d, last = %d\n", first, last); 1962 #endif 1963 1964 /* Do a long multiplication of the bits of 2/pi by the 1965 integer mantissa */ 1966 #if 0 1967 for (i = last; i >= first; i--) 1968 { 1969 u = pibits[i] * ux + carry; 1970 res[i - first] = u & mask; 1971 carry = u >> bitsper; 1972 } 1973 res[last - first + 1] = 0; 1974 #else 1975 /* Unroll the loop. This is only correct because we know 1976 that bitsper is fixed as 36. */ 1977 res[4] = 0; 1978 u = pibits[last] * ux; 1979 res[3] = u & mask; 1980 carry = u >> bitsper; 1981 u = pibits[last - 1] * ux + carry; 1982 res[2] = u & mask; 1983 carry = u >> bitsper; 1984 u = pibits[last - 2] * ux + carry; 1985 res[1] = u & mask; 1986 carry = u >> bitsper; 1987 u = pibits[first] * ux + carry; 1988 res[0] = u & mask; 1989 #endif 1990 1991 #ifdef DEBUGGING_PRINT 1992 printf("resexp = %d\n", resexp); 1993 printf("Significant part of x * 2/pi with binary" 1994 " point in correct place:\n"); 1995 for (i = 0; i <= last - first; i++) 1996 { 1997 if (i > 0 && i % 5 == 0) 1998 printf("\n "); 1999 if (i == 1) 2000 printf("%s ", d2b(res[i], bitsper, resexp)); 2001 else 2002 printf("%s ", d2b(res[i], bitsper, -1)); 2003 } 2004 printf("\n"); 2005 #endif 2006 2007 /* Reconstruct the result */ 2008 ltb = (int)((((res[0] << bitsper) | res[1]) 2009 >> (bitsper - 1 - resexp)) & 7); 2010 2011 /* determ says whether the fractional part is >= 0.5 */ 2012 determ = ltb & 1; 2013 2014 #ifdef DEBUGGING_PRINT 2015 printf("ltb = %d (last two bits before binary point" 2016 " and first bit after)\n", ltb); 2017 printf("determ = %d (1 means need to negate because the fractional\n" 2018 " part of x * 2/pi is greater than 0.5)\n", determ); 2019 #endif 2020 2021 i = 1; 2022 if (determ) 2023 { 2024 /* The mantissa is >= 0.5. We want to subtract it 2025 from 1.0 by negating all the bits */ 2026 *region = ((ltb >> 1) + 1) & 3; 2027 mant = 1; 2028 mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1); 2029 while (mant < 0x0000000000010000) 2030 { 2031 i++; 2032 mant = (mant << bitsper) | (~(res[i]) & mask); 2033 } 2034 nextbits = (~(res[i+1]) & mask); 2035 } 2036 else 2037 { 2038 *region = (ltb >> 1); 2039 mant = 1; 2040 mant = res[1] & ((mant << (bitsper - resexp)) - 1); 2041 while (mant < 0x0000000000010000) 2042 { 2043 i++; 2044 mant = (mant << bitsper) | res[i]; 2045 } 2046 nextbits = res[i+1]; 2047 } 2048 2049 #ifdef DEBUGGING_PRINT 2050 printf("First bits of mant = %s\n", d2b(mant, bitsper, -1)); 2051 #endif 2052 2053 /* Normalize the mantissa. The shift value 6 here, determined by 2054 trial and error, seems to give optimal speed. */ 2055 bc = 0; 2056 while (mant < 0x0000400000000000) 2057 { 2058 bc += 6; 2059 mant <<= 6; 2060 } 2061 while (mant < 0x0010000000000000) 2062 { 2063 bc++; 2064 mant <<= 1; 2065 } 2066 mant |= nextbits >> (bitsper - bc); 2067 2068 rexp = 52 + resexp - bc - i * bitsper; 2069 2070 #ifdef DEBUGGING_PRINT 2071 printf("Normalised mantissa = 0x%016lx\n", mant); 2072 printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp); 2073 #endif 2074 2075 /* Put the result exponent rexp onto the mantissa pattern */ 2076 u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; 2077 ux = (mant & MANTBITS_DP64) | u; 2078 if (determ) 2079 /* If we negated the mantissa we negate x too */ 2080 ux |= SIGNBIT_DP64; 2081 PUT_BITS_DP64(ux, dx); 2082 2083 #ifdef DEBUGGING_PRINT 2084 printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx)); 2085 #endif 2086 2087 /* x is a double precision version of the fractional part of 2088 x * 2 / pi. Multiply x by pi/2 in double precision 2089 to get the reduced argument r. */ 2090 *r = dx * piby2; 2091 2092 #ifdef DEBUGGING_PRINT 2093 printf(" r = frac(x*2/pi) * pi/2:\n"); 2094 printf(" r = %25.20e = %s\n", *r, double2hex(r)); 2095 printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n", 2096 *region); 2097 #endif 2098 } 2099 #endif /* USE_REMAINDER_PIBY2F_INLINE */ 2100 2101 #endif /* LIBM_INLINES_AMD_H_INCLUDED */ 2102