1 /* 2 * This file contains useful tools and data for the crlibm functions. 3 * 4 * Copyright (C) 2004-2011 David Defour, Catherine Daramy-Loirat, 5 * Florent de Dinechin, Matthieu Gallet, Nicolas Gast, Christoph Quirin Lauter, 6 * and Jean-Michel Muller 7 * 8 * This file is part of crlibm, the correctly rounded mathematical library, 9 * which has been developed by the Arénaire project at École normale supérieure 10 * de Lyon. 11 * 12 * This library is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU Lesser General Public 14 * License as published by the Free Software Foundation; either 15 * version 2.1 of the License, or (at your option) any later version. 16 * 17 * This library is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 * Lesser General Public License for more details. 21 * 22 * You should have received a copy of the GNU Lesser General Public 23 * License along with this library; if not, write to the Free Software 24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25 */ 26 27 28 #ifndef CRLIBM_PRIVATE_H 29 #define CRLIBM_PRIVATE_H 1 30 31 #include "scs_lib/scs.h" 32 #include "scs_lib/scs_private.h" 33 34 #ifdef HAVE_CONFIG_H 35 #include "crlibm_config.h" 36 #endif 37 /* otherwise CMake is used, and defines all the useful variables using -D switch */ 38 39 #ifdef HAVE_INTTYPES_H 40 #include <inttypes.h> 41 #endif 42 43 44 45 #if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)) 46 # ifdef CRLIBM_HAS_FPU_CONTROL 47 # include <fpu_control.h> 48 # ifndef _FPU_SETCW 49 # define _FPU_SETCW(cw) __asm__ ("fldcw %0" : : "m" (*&cw)) 50 # endif 51 # ifndef _FPU_GETCW 52 # define _FPU_GETCW(cw) __asm__ ("fnstcw %0" : "=m" (*&cw)) 53 # endif 54 # endif 55 #endif 56 57 /* 64 bit arithmetic may be standardised, but people still do what they want */ 
/* 64-bit integer compatibility layer.
   ULL(bits) builds an unsigned 64-bit literal from a string of hex digits,
   e.g. ULL(7ff0000000000000). */
#ifdef HAVE_INTTYPES_H
#define ULL(bits) 0x##bits##uLL
#elif defined(_WIN32)
/* Windows garbage there */
/* NOTE(review): defining int64_t/uint64_t ourselves uses names reserved for
   the implementation -- kept only for legacy toolchains without <inttypes.h> */
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
#define ULL(bits) 0x##bits##i64
/* Default, hoping it works, hopefully less and less relevant */
#else
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
#define ULL(bits) 0x##bits##uLL
#endif

/* NOTE(review): on pre-LP64 HP-UX this re-typedefs int64_t/uint64_t and
   redefines ULL; assumes the HAVE_INTTYPES_H branch was not taken there */
#ifndef SCS_DEF_INT64
#define SCS_DEF_INT64
#ifdef CRLIBM_TYPEOS_HPUX
#ifndef __LP64__ /* To solve the problem with 64 bits integer on HPPA */
typedef long long int64_t;
typedef unsigned long long uint64_t;
#define ULL(bits) 0x##bits##uLL
#endif
#endif
#endif




/* The Add12 and Add22 functions, as well as double-double
   multiplications of the Dekker family may be either defined as
   functions, or as #defines. Which one is better depends on the
   processor/compiler/OS. As #define has to be used with more care (not
   type-safe), the two following variables should be set to 1 in the
   development/debugging phase, until no type warning remains.
*/

#define ADD22_AS_FUNCTIONS 0
#define DEKKER_AS_FUNCTIONS 0
#define SQRT_AS_FUNCTIONS 0

/* The conditional version of the Add12 can be implemented either
   using 3 floating point additions, an absolute value test and
   a branch, or using 6 floating point additions but no branch.
   The Add22 sequence is similar.
   The branchless versions might be faster on some systems.

   The function versions of Add12Cond and Add22Cond are not
   implemented in branchless versions.
*/

#define AVOID_BRANCHES 1


/* setting the following variable adds variables and code for
   monitoring the performance.
   Note that sometimes only round to nearest is instrumented */
#define EVAL_PERF  1


#if EVAL_PERF==1
/* counter of calls to the second step (accurate step) */
extern int crlibm_second_step_taken;
#endif



/* The prototypes of the second steps */
/* extern void exp_SC(scs_ptr res_scs, double x);*/




/*
 * i = d in rounding to nearest.
 * The constant added is 2^52 + 2^51: it pushes d into the binade where
 * the unit in the last place is 1, so after the (round-to-nearest)
 * addition the low 32 bits of the significand hold the rounded integer.
 */
#define DOUBLE2INT(_i, _d)              \
  {db_number _t;                        \
    _t.d = (_d+6755399441055744.0);     \
    _i = _t.i[LO];}


/* Same idea but beware: works only for |_i| < 2^51 -1 */
#define DOUBLE2LONGINT(_i, _d)                                          \
  {                                                                     \
    db_number _t;                                                       \
    _t.d = (_d+6755399441055744.0);                                     \
    if (_d >= 0) /* sign extend */                                      \
      _i = _t.l & ULL(0007FFFFFFFFFFFF);                                \
    else                                                                \
      _i = (_t.l & ULL(0007FFFFFFFFFFFF)) |  (ULL(FFF8000000000000));   \
  }




/* Macros for the rounding tests in directed modes */
/* After Evgeny Gvozdev pointed out a bug in the rounding procedures I
   decided to centralize them here

   Note that these tests launch the accurate phase when yl=0, in
   particular in the exceptional cases when the image of a double is a
   double.
See the chapter about the log for an example 164 165 All this does not work for denormals, of course 166 */ 167 168 169 #define TEST_AND_RETURN_RU(__yh__, __yl__, __eps__) \ 170 { \ 171 db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \ 172 __yhdb__.d = __yh__; __yldb__.d = __yl__; \ 173 yh_neg = (__yhdb__.i[HI] & 0x80000000); \ 174 yl_neg = (__yldb__.i[HI] & 0x80000000); \ 175 __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 176 __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 177 u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \ 178 if(__yldb__.d > __eps__ * u53.d){ \ 179 if(!yl_neg) { /* The case yl==0 is filtered by the above test*/ \ 180 /* return next up */ \ 181 __yhdb__.d = __yh__; \ 182 if(yh_neg) __yhdb__.l--; else __yhdb__.l++; /* Beware: fails for zero */ \ 183 return __yhdb__.d ; \ 184 } \ 185 else return __yh__; \ 186 } \ 187 } 188 189 190 #define TEST_AND_RETURN_RD(__yh__, __yl__, __eps__) \ 191 { \ 192 db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \ 193 __yhdb__.d = __yh__; __yldb__.d = __yl__; \ 194 yh_neg = (__yhdb__.i[HI] & 0x80000000); \ 195 yl_neg = (__yldb__.i[HI] & 0x80000000); \ 196 __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 197 __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 198 u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \ 199 if(__yldb__.d > __eps__ * u53.d){ \ 200 if(yl_neg) { /* The case yl==0 is filtered by the above test*/ \ 201 /* return next down */ \ 202 __yhdb__.d = __yh__; \ 203 if(yh_neg) __yhdb__.l++; else __yhdb__.l--; /* Beware: fails for zero */ \ 204 return __yhdb__.d ; \ 205 } \ 206 else return __yh__; \ 207 } \ 208 } 209 210 211 212 #define TEST_AND_RETURN_RZ(__yh__, __yl__, __eps__) \ 213 { \ 214 db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \ 215 __yhdb__.d = __yh__; __yldb__.d = __yl__; \ 216 yh_neg = 
(__yhdb__.i[HI] & 0x80000000); \ 217 yl_neg = (__yldb__.i[HI] & 0x80000000); \ 218 __yhdb__.l = __yhdb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\ 219 __yldb__.l = __yldb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\ 220 u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \ 221 if(__yldb__.d > __eps__ * u53.d){ \ 222 if(yl_neg!=yh_neg) { \ 223 __yhdb__.d = __yh__; \ 224 __yhdb__.l--; /* Beware: fails for zero */ \ 225 return __yhdb__.d ; \ 226 } \ 227 else return __yh__; \ 228 } \ 229 } 230 231 232 233 #define TEST_AND_COPY_RU(__cond__, __res__, __yh__, __yl__, __eps__) \ 234 { \ 235 db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \ 236 __yhdb__.d = __yh__; __yldb__.d = __yl__; \ 237 yh_neg = (__yhdb__.i[HI] & 0x80000000); \ 238 yl_neg = (__yldb__.i[HI] & 0x80000000); \ 239 __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 240 __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 241 u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \ 242 __cond__ = 0; \ 243 if(__yldb__.d > __eps__ * u53.d){ \ 244 __cond__ = 1; \ 245 if(!yl_neg) { /* The case yl==0 is filtered by the above test*/ \ 246 /* return next up */ \ 247 __yhdb__.d = __yh__; \ 248 if(yh_neg) __yhdb__.l--; else __yhdb__.l++; /* Beware: fails for zero */ \ 249 __res__ = __yhdb__.d ; \ 250 } \ 251 else { \ 252 __res__ = __yh__; \ 253 } \ 254 } \ 255 } 256 257 #define TEST_AND_COPY_RD(__cond__, __res__, __yh__, __yl__, __eps__) \ 258 { \ 259 db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \ 260 __yhdb__.d = __yh__; __yldb__.d = __yl__; \ 261 yh_neg = (__yhdb__.i[HI] & 0x80000000); \ 262 yl_neg = (__yldb__.i[HI] & 0x80000000); \ 263 __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 264 __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \ 265 u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + 
ULL(0010000000000000); \ 266 __cond__ = 0; \ 267 if(__yldb__.d > __eps__ * u53.d){ \ 268 __cond__ = 1; \ 269 if(yl_neg) { /* The case yl==0 is filtered by the above test*/ \ 270 /* return next down */ \ 271 __yhdb__.d = __yh__; \ 272 if(yh_neg) __yhdb__.l++; else __yhdb__.l--; /* Beware: fails for zero */ \ 273 __res__ = __yhdb__.d ; \ 274 } \ 275 else { \ 276 __res__ = __yh__; \ 277 } \ 278 } \ 279 } 280 281 282 #define TEST_AND_COPY_RZ(__cond__, __res__, __yh__, __yl__, __eps__) \ 283 { \ 284 db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \ 285 __yhdb__.d = __yh__; __yldb__.d = __yl__; \ 286 yh_neg = (__yhdb__.i[HI] & 0x80000000); \ 287 yl_neg = (__yldb__.i[HI] & 0x80000000); \ 288 __yhdb__.l = __yhdb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\ 289 __yldb__.l = __yldb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\ 290 u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \ 291 __cond__ = 0; \ 292 if(__yldb__.d > __eps__ * u53.d){ \ 293 if(yl_neg!=yh_neg) { \ 294 __yhdb__.d = __yh__; \ 295 __yhdb__.l--; /* Beware: fails for zero */ \ 296 __res__ = __yhdb__.d ; \ 297 __cond__ = 1; \ 298 } \ 299 else { \ 300 __res__ = __yh__; \ 301 __cond__ = 1; \ 302 } \ 303 } 304 305 306 307 /* If the processor has a FMA, use it ! **/ 308 309 /* All this probably works only with gcc. 
   See Markstein book for the case of HP's compiler */

/* PowerPC: map FMA/FMS onto the fmadd/fmsub instructions through gcc
   statement expressions and extended inline asm. */
#if defined(CRLIBM_TYPECPU_POWERPC) && defined(__GNUC__)
#define PROCESSOR_HAS_FMA 1
#define FMA(a,b,c)  /* r = a*b + c*/                      \
({                                                        \
  double _a, _b,_c,_r;                                    \
  _a=a; _b=b;_c=c;                                        \
  __asm__ ("fmadd %0, %1, %2, %3\n ;;\n"                  \
           : "=f"(_r)                                     \
           : "f"(_a), "f"(_b), "f"(_c)                    \
           );                                             \
  _r;                                                     \
})


#define FMS(a,b,c)  /* r = a*b - c*/                      \
({                                                        \
  double _a, _b,_c,_r;                                    \
  _a=a; _b=b;_c=c;                                        \
  __asm__ ("fmsub %0, %1, %2, %3\n ;;\n"                  \
           : "=f"(_r)                                     \
           : "f"(_a), "f"(_b), "f"(_c)                    \
           );                                             \
  _r;                                                     \
})

#endif /* defined(CRLIBM_TYPECPU_POWERPC) && defined(__GNUC__) */




/* On the Itanium 1 / gcc3.2 we lose 10 cycles when using the FMA !?!
   It probably breaks the scheduling algorithms somehow...
   To test again with higher gcc versions
*/

/* The trailing "&& 0" deliberately disables this block (see comment above) */
#if defined(CRLIBM_TYPECPU_ITANIUM)  && defined(__GNUC__)  && !defined(__INTEL_COMPILER) && 0
#define PROCESSOR_HAS_FMA 1
#define FMA(a,b,c)  /* r = a*b + c*/                      \
({                                                        \
  double _a, _b,_c,_r;                                    \
  _a=a; _b=b;_c=c;                                        \
  __asm__ ("fma %0 = %1, %2, %3\n ;;\n"                   \
           : "=f"(_r)                                     \
           : "f"(_a), "f"(_b), "f"(_c)                    \
           );                                             \
  _r;                                                     \
})


#define FMS(a,b,c)  /* r = a*b - c*/                      \
({                                                        \
  double _a, _b, _c, _r;                                  \
  _a=a; _b=b;_c=c;                                        \
  __asm__ ("fms %0 = %1, %2, %3\n ;;\n"                   \
           : "=f"(_r)                                     \
           : "f"(_a), "f"(_b), "f"(_c)                    \
           );                                             \
  _r;                                                     \
})
#endif /* defined(CRLIBM_TYPECPU_ITANIUM)  && defined(__GNUC__) && !defined(__INTEL_COMPILER) */




/* Itanium with the Intel compiler: use the _Asm_fma/_Asm_fms intrinsics.
   NOTE(review): these expansions end with a ';', so FMA/FMS may only be
   used as a full statement RHS here, not nested inside an expression. */
#if defined(CRLIBM_TYPECPU_ITANIUM)  && defined(__INTEL_COMPILER)
#define PROCESSOR_HAS_FMA 1
#if 0 /* Commented out because it shouldn't be there: There should be
         a standard #include doing all this, but as of april 2005
         it doesn't exist, say intel people). Leave
         it as documentation, though, until it is replaced by #include
      */
/* Table 1-17: legal floating-point precision completers (.pc) */
typedef enum {
  _PC_S = 1 /* single .s */
  ,_PC_D = 2 /* double .d */
  ,_PC_NONE = 3 /* dynamic */
} _Asm_pc;

/* Table 1-22: legal getf/setf floating-point register access completers */
typedef enum {
  _FR_S = 1 /* single form .s */
  ,_FR_D = 2 /* double form .d */
  ,_FR_EXP = 3 /* exponent form .exp */
  ,_FR_SIG = 4 /* significand form .sig */
} _Asm_fr_access;

/* Table 1-24: legal floating-point FPSR status field completers (.sf) */
typedef enum {
  _SF0 = 0 /* FPSR status field 0 .s0 */
  ,_SF1 = 1 /* FPSR status field 1 .s1 */
  ,_SF2 = 2 /* FPSR status field 2 .s2 */
  ,_SF3 = 3 /* FPSR status field 3 .s3 */
} _Asm_sf;
#endif

#define FMA(a,b,c)  /* r = a*b + c*/                      \
   _Asm_fma( 2/*_PC_D*/, a, b, c, 0/*_SF0*/ );


#define FMS(a,b,c)  /* r = a*b - c*/                      \
   _Asm_fms( 2/*_PC_D*/, a, b, c, 0/*_SF0*/);

#endif /*defined(CRLIBM_TYPECPU_ITANIUM)  && defined(__INTEL_COMPILER)*/




/* Initializer for a db_number equal to 1.0, with the word order matching
   the host endianness */
#ifdef WORDS_BIGENDIAN
#define DB_ONE    {{0x3ff00000, 0x00000000}}
#else
#define DB_ONE    {{0x00000000 ,0x3ff00000}}
#endif




/* Useful SCS constants, defined once in the library and shared here */
extern const scs scs_zer, scs_half, scs_one, scs_two, scs_sixinv;


#define SCS_ZERO    (scs_ptr)(&scs_zer)
#define SCS_HALF    (scs_ptr)(&scs_half)
#define SCS_ONE     (scs_ptr)(&scs_one)
#define SCS_TWO     (scs_ptr)(&scs_two)
#define SCS_SIXINV  (scs_ptr)(&scs_sixinv)




/* Absolute value: branch-free builtin under gcc, ternary elsewhere
   (the ternary evaluates x twice -- beware of side effects) */
#if defined(__GNUC__)
#define ABS(x) (__builtin_fabs((x)))
#else
#define ABS(x) (((x)>0) ? (x) : (-(x)))
#endif




/*
 * In the following, when an operator is preceded by a '@' it means that we
 * are considering the IEEE-compliant machine operator, otherwise it
 * is the mathematical operator.
459 * 460 */ 461 462 463 /* 464 * computes s and r such that s + r = a + b, with s = a @+ b exactly 465 */ 466 #if AVOID_BRANCHES 467 #define Add12Cond(s, r, a, b) \ 468 { \ 469 double _u1, _u2, _u3, _u4; \ 470 double _a=a, _b=b; \ 471 \ 472 s = _a + _b; \ 473 _u1 = s - _a; \ 474 _u2 = s - _u1; \ 475 _u3 = _b - _u1; \ 476 _u4 = _a - _u2; \ 477 r = _u4 + _u3; \ 478 } 479 480 #else 481 #define Add12Cond(s, r, a, b) \ 482 {double _z, _a=a, _b=b; \ 483 s = _a + _b; \ 484 if (ABS(a) > ABS(b)){ \ 485 _z = s - _a; \ 486 r = _b - _z; \ 487 }else { \ 488 _z = s - _b; \ 489 r = _a - _z;}} 490 #endif 491 492 /* 493 * computes s and r such that s + r = a + b, with s = a @+ b exactly 494 * under the condition a >= b 495 */ 496 #define Add12(s, r, a, b) \ 497 {double _z, _a=a, _b=b; \ 498 s = _a + _b; \ 499 _z = s - _a; \ 500 r = _b - _z; } 501 502 503 /* 504 * computes r1, r2, r3 such that r1 + r2 + r3 = a + b + c exactly 505 */ 506 #define Fast3Sum(r1, r2, r3, a, b, c) \ 507 {double u, v, w; \ 508 Fast2Sum(u, v, b, c); \ 509 Fast2Sum(r1, w, a, u); \ 510 Fast2Sum(r2, r3, w, v); } 511 512 513 514 515 516 517 518 /* 519 * Functions to computes double-double addition: zh+zl = xh+xl + yh+yl 520 * knowing that xh>yh 521 * relative error is smaller than 2^-103 522 */ 523 524 525 #if ADD22_AS_FUNCTIONS 526 extern void Add22(double *zh, double *zl, double xh, double xl, double yh, double yl); 527 extern void Add22Cond(double *zh, double *zl, double xh, double xl, double yh, double yl); 528 529 #else /* ADD22_AS_FUNCTIONS */ 530 531 #if AVOID_BRANCHES 532 #define Add22Cond(zh,zl,xh,xl,yh,yl) \ 533 do { \ 534 double _v1, _v2, _v3, _v4; \ 535 \ 536 Add12Cond(_v1, _v2, (xh), (yh)); \ 537 _v3 = (xl) + (yl); \ 538 _v4 = _v2 + _v3; \ 539 Add12((*(zh)),(*(zl)),_v1,_v4); \ 540 } while (2+2==5) 541 #else 542 #define Add22Cond(zh,zl,xh,xl,yh,yl) \ 543 do { \ 544 double _r,_s; \ 545 _r = (xh)+(yh); \ 546 _s = ((ABS(xh)) > (ABS(yh)))? 
((xh)-_r+(yh)+(yl)+(xl)) : ((yh)-_r+(xh)+(xl)+(yl)); \ 547 *zh = _r+_s; \ 548 *zl = (_r - (*zh)) + _s; \ 549 } while(2+2==5) 550 #endif 551 552 553 #define Add22(zh,zl,xh,xl,yh,yl) \ 554 do { \ 555 double _r,_s; \ 556 _r = (xh)+(yh); \ 557 _s = ((((xh)-_r) +(yh)) + (yl)) + (xl); \ 558 *zh = _r+_s; \ 559 *zl = (_r - (*zh)) + _s; \ 560 } while(0) 561 562 #endif /* ADD22_AS_FUNCTIONS */ 563 564 565 566 #ifdef PROCESSOR_HAS_FMA 567 /* One of the nice things with the fused multiply-and-add is that it 568 greatly simplifies the double-double multiplications : */ 569 #define Mul12(rh,rl,u,v) \ 570 { \ 571 *rh = u*v; \ 572 *rl = FMS(u,v, *rh); \ 573 } 574 575 #define Mul22(pzh,pzl, xh,xl, yh,yl) \ 576 { \ 577 double ph, pl; \ 578 ph = xh*yh; \ 579 pl = FMS(xh, yh, ph); \ 580 pl = FMA(xh,yl, pl); \ 581 pl = FMA(xl,yh,pl); \ 582 *pzh = ph+pl; \ 583 *pzl = ph - (*pzh); \ 584 *pzl += pl; \ 585 } 586 587 588 /* besides we don't care anymore about overflows in the mult */ 589 #define Mul12Cond Mul12 590 #define Mul22cond Mul22 591 592 593 #else /* ! PROCESSOR_HAS_FMA */ 594 595 596 #if DEKKER_AS_FUNCTIONS 597 extern void Mul12(double *rh, double *rl, double u, double v); 598 extern void Mul12Cond(double *rh, double *rl, double a, double b); 599 extern void Mul22(double *zh, double *zl, double xh, double xl, double yh, double yl); 600 #else /* if DEKKER_AS_FUNCTIONS */ 601 /* 602 * computes rh and rl such that rh + rl = a * b with rh = a @* b exactly 603 * under the conditions : a < 2^970 et b < 2^970 604 */ 605 #if 1 606 #define Mul12(rh,rl,u,v) \ 607 { \ 608 const double c = 134217729.; /* 2^27 +1 */ \ 609 double up, u1, u2, vp, v1, v2; \ 610 double _u=u, _v=v; \ 611 up = _u*c; vp = _v*c; \ 612 u1 = (_u-up)+up; v1 = (_v-vp)+vp; \ 613 u2 = _u-u1; v2 = _v-v1; \ 614 \ 615 *rh = _u*_v; \ 616 *rl = (((u1*v1-*rh)+(u1*v2))+(u2*v1))+(u2*v2);\ 617 } 618 #else 619 /* This works but is much slower. 
Problem: 620 SSE2 instructions are two-address, and intrinsincs are 3-address */ 621 #include<emmintrin.h> 622 #define Mul12(rh,rl,u,v) \ 623 { \ 624 const double c = 134217729.; /* 2^27 +1 */ \ 625 __m128d _u_v = _mm_set_pd (u,v); \ 626 __m128d c2=_mm_set1_pd(c); \ 627 c2 = _mm_mul_pd(c2, _u_v); \ 628 __m128d u1v1 = _mm_sub_pd(_u_v, c2); \ 629 u1v1 = _mm_add_pd(u1v1, c2); \ 630 __m128d u2v2 = _mm_sub_pd(_u_v, u1v1); \ 631 __m128d _v_u = _mm_shuffle_pd(_u_v, _u_v, _MM_SHUFFLE2 (0,1)); \ 632 __m128d rhrh = _mm_mul_pd(_v_u, _u_v); \ 633 _mm_store_sd (rh, rhrh); \ 634 __m128d v2u2 = _mm_shuffle_pd(u2v2, u2v2, _MM_SHUFFLE2 (0,1)); \ 635 __m128d u1v2u2v1 = _mm_mul_pd(u1v1, v2u2); \ 636 __m128d u2v1u1v2 = _mm_shuffle_pd(u1v2u2v1, u1v2u2v1, _MM_SHUFFLE2 (0,1)); \ 637 __m128d uvmed = _mm_add_pd(u1v2u2v1, u2v1u1v2); \ 638 __m128d u1u2 = _mm_shuffle_pd(u1v1, u2v2, _MM_SHUFFLE2 (1,1)); \ 639 __m128d v1v2 = _mm_shuffle_pd(u1v1, u2v2, _MM_SHUFFLE2 (0,0)); \ 640 __m128d u1v1u2v2 = _mm_mul_pd(u1u2, v1v2); \ 641 __m128d tmp = _mm_sub_pd(u1v1u2v2, rhrh); \ 642 tmp = _mm_add_pd(tmp, uvmed); \ 643 __m128d u2v2u2v2 = _mm_mul_pd(u2v2, v2u2); \ 644 tmp = _mm_add_pd(tmp, u2v2u2v2); \ 645 _mm_store_sd (rl, tmp); \ 646 } 647 #endif 648 649 /* 650 double _u =u, _v=v; \ 651 __m128d _u_v = _mm_set_pd(_u, _v); \ 652 */ \ 653 /* 654 * Computes rh and rl such that rh + rl = a * b and rh = a @* b exactly 655 */ 656 #define Mul12Cond(rh, rl, a, b) \ 657 {\ 658 const double two_em53 = 1.1102230246251565404e-16; /* 0x3CA00000, 0x00000000 */\ 659 const double two_e53 = 9007199254740992.; /* 0x43400000, 0x00000000 */\ 660 double u, v; \ 661 db_number _a=a, _b=b; \ 662 \ 663 if (_a.i[HI]>0x7C900000) u = _a*two_em53; \ 664 else u = _a; \ 665 if (_b.i[HI]>0x7C900000) v = _b*two_em53; \ 666 else v = _b; \ 667 \ 668 Mul12(rh, rl, u, v); \ 669 \ 670 if (_a.i[HI]>0x7C900000) {*rh *= two_e53; *rl *= two_e53;} \ 671 if (_b.i[HI]>0x7C900000) {*rh *= two_e53; *rl *= two_e53;} \ 672 } 673 674 675 676 /* 677 * 
computes double-double multiplication: zh+zl = (xh+xl) * (yh+yl) 678 * relative error is smaller than 2^-102 679 */ 680 681 682 683 #define Mul22(zh,zl,xh,xl,yh,yl) \ 684 { \ 685 double mh, ml; \ 686 \ 687 const double c = 134217729.; \ 688 double up, u1, u2, vp, v1, v2; \ 689 \ 690 up = (xh)*c; vp = (yh)*c; \ 691 u1 = ((xh)-up)+up; v1 = ((yh)-vp)+vp; \ 692 u2 = (xh)-u1; v2 = (yh)-v1; \ 693 \ 694 mh = (xh)*(yh); \ 695 ml = (((u1*v1-mh)+(u1*v2))+(u2*v1))+(u2*v2); \ 696 \ 697 ml += (xh)*(yl) + (xl)*(yh); \ 698 *zh = mh+ml; \ 699 *zl = mh - (*zh) + ml; \ 700 } 701 702 703 704 #endif /* DEKKER_AS_FUNCTIONS */ 705 706 #endif /* PROCESSOR_HAS_FMA */ 707 708 /* Additional double-double operators */ 709 710 /* Eps Mul122 <= 2^-102 */ 711 #define Mul122(resh,resl,a,bh,bl) \ 712 { \ 713 double _t1, _t2, _t3, _t4; \ 714 \ 715 Mul12(&_t1,&_t2,(a),(bh)); \ 716 _t3 = (a) * (bl); \ 717 _t4 = _t2 + _t3; \ 718 Add12((*(resh)),(*(resl)),_t1,_t4); \ 719 } 720 721 /* Eps MulAdd212 <= 2^-100 for |a * (bh + bl)| <= 1/4 * |ch + cl| */ 722 #define MulAdd212(resh,resl,ch,cl,a,bh,bl) \ 723 { \ 724 double _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8; \ 725 \ 726 Mul12(&_t1,&_t2,(a),(bh)); \ 727 Add12(_t3,_t4,(ch),_t1); \ 728 _t5 = (bl) * (a); \ 729 _t6 = (cl) + _t2; \ 730 _t7 = _t5 + _t6; \ 731 _t8 = _t7 + _t4; \ 732 Add12((*(resh)),(*(resl)),_t3,_t8); \ 733 } 734 735 /* Eps MulAdd212 <= 2^-100 736 for |(ah + bh) * (bh + bl)| <= 1/4 * |ch + cl| 737 */ 738 #define MulAdd22(resh,resl,ch,cl,ah,al,bh,bl) \ 739 { \ 740 double _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8; \ 741 double _t9, _t10; \ 742 \ 743 Mul12(&_t1,&_t2,(ah),(bh)); \ 744 Add12(_t3,_t4,(ch),_t1); \ 745 _t5 = (ah) * (bl); \ 746 _t6 = (al) * (bh); \ 747 _t7 = _t2 + (cl); \ 748 _t8 = _t4 + _t7; \ 749 _t9 = _t5 + _t6; \ 750 _t10 = _t8 + _t9; \ 751 Add12((*(resh)),(*(resl)),_t3,_t10); \ 752 } 753 754 #define Add122(resh,resl,a,bh,bl) \ 755 { \ 756 double _t1, _t2, _t3; \ 757 \ 758 Add12(_t1,_t2,(a),(bh)); \ 759 _t3 = _t2 + (bl); \ 760 
Add12((*(resh)),(*(resl)),_t1,_t3); \ 761 } 762 763 #define Add122Cond(resh,resl,a,bh,bl) \ 764 { \ 765 double _t1, _t2, _t3; \ 766 \ 767 Add12Cond(_t1,_t2,(a),(bh)); \ 768 _t3 = _t2 + (bl); \ 769 Add12((*(resh)),(*(resl)),_t1,_t3); \ 770 } 771 772 773 #define Add212(resh,resl,ah,al,b) \ 774 { \ 775 double _t1, _t2, _t3; \ 776 \ 777 Add12(_t1,_t2,(ah),b); \ 778 _t3 = _t2 + (al); \ 779 Add12((*(resh)),(*(resl)),_t1,_t3); \ 780 } 781 782 783 /* In the following the one-line computation of _cl was split so that 784 icc(8.1) would compile it properly. It's a bug of icc */ 785 786 #if DEKKER_AS_FUNCTIONS 787 extern void Div22(double *z, double *zz, double x, double xx, double y, double yy); 788 #else 789 #define Div22(pzh,pzl,xh,xl,yh,yl) { \ 790 double _ch,_cl,_uh,_ul; \ 791 _ch=(xh)/(yh); Mul12(&_uh,&_ul,_ch,(yh)); \ 792 _cl=((xh)-_uh); \ 793 _cl -= _ul; \ 794 _cl += (xl); \ 795 _cl -= _ch*(yl); \ 796 _cl /= (yh); \ 797 *pzh=_ch+_cl; *pzl=(_ch-(*pzh))+_cl; \ 798 } 799 #endif /* DEKKER_AS_FUNCTIONS */ 800 801 802 803 /* 804 Coefficients for 1/sqrt(m) with 1/2 < m < 2 805 The corresponding relative polynomial approximation error is less than 806 eps < 2^(-8.3127) (cf. Maple file) 807 The Itanium instruction frsqrta is slightly more accurate; it can 808 therefore easily replace the polynomial evaluation. 
809 */ 810 811 #define SQRTPOLYC0 2.50385236695888790947606139525305479764938354492188e+00 812 #define SQRTPOLYC1 -3.29763389114324168005509818613063544034957885742188e+00 813 #define SQRTPOLYC2 2.75726076139124520736345402838196605443954467773438e+00 814 #define SQRTPOLYC3 -1.15233725777933848632983426796272397041320800781250e+00 815 #define SQRTPOLYC4 1.86900066679800969104974228685023263096809387207031e-01 816 #define SQRTTWO52 4.50359962737049600000000000000000000000000000000000e+15 817 818 #if SQRT_AS_FUNCTIONS 819 extern void sqrt12(double *resh, double *resl, double x); 820 #else 821 822 /* Concerning special case handling see crlibm_private.h */ 823 #define sqrt12(resh, resl, x) { \ 824 db_number _xdb; \ 825 int _E; \ 826 double _m, _r0, _r1, _r2, _r3h, _r3l, _r4h, _r4l, _srtmh, _srtml; \ 827 double _r2PHr2h, _r2PHr2l, _r2Sqh, _r2Sql; \ 828 double _mMr2h, _mMr2l, _mMr2Ch, _mMr2Cl; \ 829 double _MHmMr2Ch, _MHmMr2Cl; \ 830 double _r3Sqh, _r3Sql, _mMr3Sqh, _mMr3Sql; \ 831 double _half; \ 832 \ 833 /* Special case x = 0 */ \ 834 if ((x) == 0.0) { \ 835 (*(resh)) = (x); \ 836 (*(resl)) = 0.0; \ 837 } else { \ 838 \ 839 _E = 0; \ 840 \ 841 /* Convert to integer format */ \ 842 _xdb.d = (x); \ 843 \ 844 /* Handle subnormal case */ \ 845 if (_xdb.i[HI] < 0x00100000) { \ 846 _E = -52; \ 847 _xdb.d *= ((db_number) ((double) SQRTTWO52)).d; \ 848 /* make x a normal number */ \ 849 } \ 850 \ 851 /* Extract exponent E and mantissa m */ \ 852 _E += (_xdb.i[HI]>>20)-1023; \ 853 _xdb.i[HI] = (_xdb.i[HI] & 0x000fffff) | 0x3ff00000; \ 854 _m = _xdb.d; \ 855 \ 856 _half = 0.5; \ 857 /* Make exponent even */ \ 858 if (_E & 0x00000001) { \ 859 _E++; \ 860 _m *= _half; /* Suppose now 1/2 <= m <= 2 */ \ 861 } \ 862 \ 863 /* Construct sqrt(2^E) = 2^(E/2) */ \ 864 _xdb.i[HI] = (_E/2 + 1023) << 20; \ 865 _xdb.i[LO] = 0; \ 866 \ 867 /* Compute initial approximation to r = 1/sqrt(m) */ \ 868 \ 869 _r0 = SQRTPOLYC0 + \ 870 _m * (SQRTPOLYC1 + _m * (SQRTPOLYC2 + _m * (SQRTPOLYC3 + _m * 
SQRTPOLYC4))); \ 871 \ 872 /* Iterate two times on double precision */ \ 873 \ 874 _r1 = _half * _r0 * (3.0 - _m * (_r0 * _r0)); \ 875 _r2 = _half * _r1 * (3.0 - _m * (_r1 * _r1)); \ 876 \ 877 /* Iterate two times on double-double precision */ \ 878 \ 879 Mul12(&_r2Sqh, &_r2Sql, _r2, _r2); \ 880 Add12(_r2PHr2h, _r2PHr2l, _r2, (_half * _r2)); \ 881 Mul12(&_mMr2h, &_mMr2l, _m, _r2); \ 882 Mul22(&_mMr2Ch, &_mMr2Cl, _mMr2h, _mMr2l, _r2Sqh, _r2Sql); \ 883 \ 884 _MHmMr2Ch = -_half * _mMr2Ch; \ 885 _MHmMr2Cl = -_half * _mMr2Cl; \ 886 \ 887 Add22(&_r3h, &_r3l, _r2PHr2h, _r2PHr2l, _MHmMr2Ch, _MHmMr2Cl); \ 888 \ 889 Mul22(&_r3Sqh, &_r3Sql, _r3h, _r3l, _r3h, _r3l); \ 890 Mul22(&_mMr3Sqh, &_mMr3Sql, _m, 0.0, _r3Sqh, _r3Sql); \ 891 /* To prove: mMr3Sqh = 1.0 in each case */ \ 892 \ 893 Mul22(&_r4h, &_r4l, _r3h, _r3l, 1.0, (-_half * _mMr3Sql)); \ 894 \ 895 /* Multiply obtained reciprocal square root by m */ \ 896 \ 897 Mul22(&_srtmh,&_srtml,_m,0.0,_r4h,_r4l); \ 898 \ 899 /* Multiply componentwise by sqrt(2^E) */ \ 900 /* which is an integer power of 2 that may not produce a subnormal */ \ 901 \ 902 (*(resh)) = _xdb.d * _srtmh; \ 903 (*(resl)) = _xdb.d * _srtml; \ 904 \ 905 } /* End: special case 0 */ \ 906 } 907 908 909 #define sqrt12_64(resh, resl, x) { \ 910 db_number _xdb; \ 911 int _E; \ 912 double _m, _r0, _r1, _r2, _r3h, _r3l, _r4h, _r4l, _srtmh, _srtml; \ 913 double _r2PHr2h, _r2PHr2l, _r2Sqh, _r2Sql; \ 914 double _mMr2h, _mMr2l, _mMr2Ch, _mMr2Cl; \ 915 double _MHmMr2Ch, _MHmMr2Cl; \ 916 double _r3Sqh, _r3Sql, _mMr3Sqh, _mMr3Sql; \ 917 double _half; \ 918 \ 919 /* Special case x = 0 */ \ 920 if ((x) == 0.0) { \ 921 (*(resh)) = (x); \ 922 (*(resl)) = 0.0; \ 923 } else { \ 924 \ 925 _E = 0.0; \ 926 \ 927 /* Convert to integer format */ \ 928 _xdb.d = (x); \ 929 \ 930 /* Handle subnormal case */ \ 931 if (_xdb.i[HI] < 0x00100000) { \ 932 _E = -52; \ 933 _xdb.d *= ((db_number) ((double) SQRTTWO52)).d; \ 934 /* make x a normal number */ \ 935 } \ 936 \ 937 /* Extract exponent 
E and mantissa m */ \ 938 _E += (_xdb.i[HI]>>20)-1023; \ 939 _xdb.i[HI] = (_xdb.i[HI] & 0x000fffff) | 0x3ff00000; \ 940 _m = _xdb.d; \ 941 \ 942 _half = 0.5; \ 943 /* Make exponent even */ \ 944 if (_E & 0x00000001) { \ 945 _E++; \ 946 _m *= _half; /* Suppose now 1/2 <= m <= 2 */ \ 947 } \ 948 \ 949 /* Construct sqrt(2^E) = 2^(E/2) */ \ 950 _xdb.i[HI] = (_E/2 + 1023) << 20; \ 951 _xdb.i[LO] = 0; \ 952 \ 953 /* Compute initial approximation to r = 1/sqrt(m) */ \ 954 \ 955 _r0 = SQRTPOLYC0 + \ 956 _m * (SQRTPOLYC1 + _m * (SQRTPOLYC2 + _m * (SQRTPOLYC3 + _m * SQRTPOLYC4))); \ 957 \ 958 /* Iterate two times on double precision */ \ 959 \ 960 _r1 = _half * _r0 * (3.0 - _m * (_r0 * _r0)); \ 961 _r2 = _half * _r1 * (3.0 - _m * (_r1 * _r1)); \ 962 \ 963 /* Iterate once on double-double precision */ \ 964 \ 965 Mul12(&_r2Sqh, &_r2Sql, _r2, _r2); \ 966 Add12(_r2PHr2h, _r2PHr2l, _r2, (_half * _r2)); \ 967 Mul12(&_mMr2h, &_mMr2l, _m, _r2); \ 968 Mul22(&_mMr2Ch, &_mMr2Cl, _mMr2h, _mMr2l, _r2Sqh, _r2Sql); \ 969 \ 970 _MHmMr2Ch = -_half * _mMr2Ch; \ 971 _MHmMr2Cl = -_half * _mMr2Cl; \ 972 \ 973 Add22(&_r3h, &_r3l, _r2PHr2h, _r2PHr2l, _MHmMr2Ch, _MHmMr2Cl); \ 974 \ 975 /* Multiply obtained reciprocal square root by m */ \ 976 \ 977 Mul22(&_srtmh,&_srtml,_m,0.0,_r3h,_r3l); \ 978 \ 979 /* Multiply componentwise by sqrt(2^E) */ \ 980 /* which is an integer power of 2 that may not produce a subnormal */ \ 981 \ 982 (*(resh)) = _xdb.d * _srtmh; \ 983 (*(resl)) = _xdb.d * _srtml; \ 984 \ 985 } /* End: special case 0 */ \ 986 } 987 988 /* 989 sqrt12_64_unfiltered = sqrt(x) * (1 + eps) where abs(eps) <= 2^(-64) 990 991 if x is neither subnormal nor 0 992 993 */ 994 #define sqrt12_64_unfiltered(resh, resl, x) { \ 995 db_number _xdb; \ 996 int _E; \ 997 double _m, _r0, _r1, _r2, _r3h, _r3l, _srtmh, _srtml; \ 998 double _r2PHr2h, _r2PHr2l, _r2Sqh, _r2Sql; \ 999 double _mMr2h, _mMr2l, _mMr2Ch, _mMr2Cl; \ 1000 double _MHmMr2Ch, _MHmMr2Cl; \ 1001 double _half; \ 1002 \ 1003 \ 1004 \ 1005 /* 
Convert to integer format */ \ 1006 _xdb.d = (x); \ 1007 \ 1008 \ 1009 /* Extract exponent E and mantissa m */ \ 1010 _E = (_xdb.i[HI]>>20)-1023; \ 1011 _xdb.i[HI] = (_xdb.i[HI] & 0x000fffff) | 0x3ff00000; \ 1012 _m = _xdb.d; \ 1013 \ 1014 _half = 0.5; \ 1015 /* Make exponent even */ \ 1016 if (_E & 0x00000001) { \ 1017 _E++; \ 1018 _m *= _half; /* Suppose now 1/2 <= m <= 2 */ \ 1019 } \ 1020 \ 1021 /* Construct sqrt(2^E) = 2^(E/2) */ \ 1022 _xdb.i[HI] = (_E/2 + 1023) << 20; \ 1023 _xdb.i[LO] = 0; \ 1024 \ 1025 /* Compute initial approximation to r = 1/sqrt(m) */ \ 1026 \ 1027 _r0 = SQRTPOLYC0 + \ 1028 _m * (SQRTPOLYC1 + _m * (SQRTPOLYC2 + _m * (SQRTPOLYC3 + _m * SQRTPOLYC4))); \ 1029 \ 1030 /* Iterate two times on double precision */ \ 1031 \ 1032 _r1 = _half * _r0 * (3.0 - _m * (_r0 * _r0)); \ 1033 _r2 = _half * _r1 * (3.0 - _m * (_r1 * _r1)); \ 1034 \ 1035 /* Iterate once on double-double precision */ \ 1036 \ 1037 Mul12(&_r2Sqh, &_r2Sql, _r2, _r2); \ 1038 Add12(_r2PHr2h, _r2PHr2l, _r2, (_half * _r2)); \ 1039 Mul12(&_mMr2h, &_mMr2l, _m, _r2); \ 1040 Mul22(&_mMr2Ch, &_mMr2Cl, _mMr2h, _mMr2l, _r2Sqh, _r2Sql); \ 1041 \ 1042 _MHmMr2Ch = -_half * _mMr2Ch; \ 1043 _MHmMr2Cl = -_half * _mMr2Cl; \ 1044 \ 1045 Add22(&_r3h, &_r3l, _r2PHr2h, _r2PHr2l, _MHmMr2Ch, _MHmMr2Cl); \ 1046 \ 1047 /* Multiply obtained reciprocal square root by m */ \ 1048 \ 1049 Mul122(&_srtmh,&_srtml,_m,_r3h,_r3l); \ 1050 \ 1051 /* Multiply componentwise by sqrt(2^E) */ \ 1052 /* which is an integer power of 2 that may not produce a subnormal */ \ 1053 \ 1054 (*(resh)) = _xdb.d * _srtmh; \ 1055 (*(resl)) = _xdb.d * _srtml; \ 1056 \ 1057 } 1058 1059 1060 1061 #endif /*SQRT_AS_FUNCTIONS*/ 1062 1063 /* Declaration of the debug function */ 1064 1065 void printHexa(char* s, double x); 1066 1067 1068 #endif /*CRLIBM_PRIVATE_H*/ 1069