#ifndef __DOUBLE_EXTENDED_H
#define __DOUBLE_EXTENDED_H

/* Select the 80-bit extended type for the target CPU:
   x87 long double on IA32/AMD64 (needs <fpu_control.h> to switch the
   FPU precision/rounding control word), __fpreg on Itanium when
   compiling with the Intel compiler. */
#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
#include <fpu_control.h>
typedef long double double_ext;
#endif
#if defined(CRLIBM_TYPECPU_ITANIUM) && defined(__INTEL_COMPILER)
typedef __fpreg double_ext;
#endif


/* For debugging: overlay the first 96 bits of a long double with three
   32-bit words so the raw exponent/mantissa fields can be inspected.
   NOTE(review): the word indices below assume a little-endian x86-style
   layout (mantissa low word first) — confirm before reusing elsewhere. */
typedef union {
  int i[3];
  long double d;
} db_ext_number;


/* Indices into db_ext_number.i[] (little-endian layout). */
#define DE_EXP 2          /* sign + 15-bit exponent in the low 16 bits */
#define DE_MANTISSA_HI 1  /* mantissa bits 63..32 */
#define DE_MANTISSA_LO 0  /* mantissa bits 31..0  */

/* Debug print: label _s followed by the raw hex fields of extended _y. */
#define print_ext(_s, _y) {\
  db_ext_number _yy; _yy.d=_y; \
  printf("%s %04x %08x %08x \n",_s, 0xffff&_yy.i[DE_EXP], _yy.i[DE_MANTISSA_HI], _yy.i[DE_MANTISSA_LO]); \
}

/**************************************************************************************/
/*********************************Rounding tests***************************************/
/**************************************************************************************/


/* These test work by observing the bits of your double-extended after the 53rd.
   mask should be 7ff if you trust your 64 bits (hum)
                  7fe if you trust 63 (if you have proven that maxepsilon<2^(-63) )
                  7fc         62
                  7f8         61
                  7f0         60   etc
*/

/* Mask constants for the rounding tests: select how many mantissa bits
   of the double-extended result are trusted (see the table above). */

#define ACCURATE_TO_64_BITS 0x7ff
#define ACCURATE_TO_63_BITS 0x7fe
#define ACCURATE_TO_62_BITS 0x7fc
#define ACCURATE_TO_61_BITS 0x7f8
#define ACCURATE_TO_60_BITS 0x7f0
#define ACCURATE_TO_59_BITS 0x7e0


#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))

/* x87 control words: 53-bit (double) precision in round-to-nearest,
   round-down and round-up, plus the hardware default (extended
   precision, round to nearest). */
static const unsigned short RN_Double     = (_FPU_DEFAULT & ~_FPU_EXTENDED)|_FPU_DOUBLE;
static const unsigned short RN_DoubleDown = (_FPU_DEFAULT & ~_FPU_EXTENDED & ~_FPU_RC_NEAREST)|_FPU_DOUBLE | _FPU_RC_DOWN;
static const unsigned short RN_DoubleUp   = (_FPU_DEFAULT & ~_FPU_EXTENDED & ~_FPU_RC_NEAREST)|_FPU_DOUBLE | _FPU_RC_UP;
static const unsigned short RN_DoubleExt  = _FPU_DEFAULT;

/* Switch the x87 FPU between the modes above (each switch reloads the
   control word, which is expensive — see the timing remarks below). */
#define DOUBLE_EXTENDED_MODE  _FPU_SETCW(RN_DoubleExt)
#define DOUBLE_UP_MODE        _FPU_SETCW(RN_DoubleUp)
#define DOUBLE_DOWN_MODE      _FPU_SETCW(RN_DoubleDown)
#define BACK_TO_DOUBLE_MODE   _FPU_SETCW(RN_Double)


/*
   Two rounding tests to the nearest. On Pentium 3, gcc3.3, the second
   is faster by 12 cycles (and also improves the worst-case time by 60
   cycles since it doesn't switch processor rounding mode in this
   case). However it uses a coarser error estimation.
*/

/* Ziv-style test: yh is y rounded to double, yl the residual; accept yh
   when adding the inflated error yl*rncst does not change it. Switches
   back to double mode before the comparison, and restores extended mode
   when the test fails (accurate phase follows). */
#define DE_TEST_AND_RETURN_RN_ZIV(y,rncst) \
{ double yh, yl; \
  yh = (double) y; \
  yl = y-yh; \
  BACK_TO_DOUBLE_MODE; \
  if(yh==yh + yl*rncst) return yh; \
  DOUBLE_EXTENDED_MODE; \
}


/* Bit-pattern test for round to nearest: the trailing mantissa bits
   (after the 53rd), masked by _mask, must not look like a halfway case
   (patterns 0x3ff / 0x400); otherwise fall through to the accurate
   phase. Only one mode switch on the success path. */
#define DE_TEST_AND_RETURN_RN(_y, _mask) \
{ \
  db_ext_number _z; double _yd; \
  int _lo; \
  _z.d = _y; \
  _yd = (double) _y; \
  _lo = _z.i[DE_MANTISSA_LO] &(_mask); \
  if((_lo!=(0x3ff&(_mask))) && (_lo!= (0x400&(_mask)))) { \
    BACK_TO_DOUBLE_MODE; \
    return _yd; \
  } \
}


/* Directed-rounding tests: capture the trailing bits while still in
   extended mode, switch to the directed double mode, then convert.
   Dangerous patterns here are all-zeros / all-ones (0x000 / 0xfff,
   i.e. the value may sit exactly on a double boundary). */
#define DE_TEST_AND_RETURN_RD(_y, _mask) \
{ \
  double _result; int _bits; \
  db_ext_number _z; \
  _z.d = _y; \
  DOUBLE_DOWN_MODE; \
  _bits = _z.i[DE_MANTISSA_LO] &(_mask); \
  _result = (double)(_y); \
  if( (_bits != (0xfff&(_mask))) && (_bits != (0x000&(_mask))) ) { \
    BACK_TO_DOUBLE_MODE; \
    return _result; \
  } \
  DOUBLE_EXTENDED_MODE; \
}

#define DE_TEST_AND_RETURN_RU(_y, _mask) \
{ \
  double _result; int _bits; \
  db_ext_number _z; \
  _z.d = _y; \
  DOUBLE_UP_MODE; \
  _bits = _z.i[DE_MANTISSA_LO] &(_mask); \
  _result = (double)(_y); \
  if( (_bits != (0xfff&(_mask))) && (_bits != (0x000&(_mask))) ) { \
    BACK_TO_DOUBLE_MODE; \
    return _result; \
  } \
  DOUBLE_EXTENDED_MODE; \
}


/* Use this one if you want a final computation step to overlap with
   the rounding test.
   Examples: multiplication by a sign or by a power of 2 */

/* Like DE_TEST_AND_RETURN_RN, but the value tested (_ytest) and the
   value returned (_yreturn) differ, so a last cheap operation (sign,
   power of 2) can overlap with the test. */
#define DE_TEST_AND_RETURN_RN2(_ytest, _yreturn, _mask) \
{ \
  db_ext_number _z; double _y_return_d; \
  int _bits; \
  _z.d = _ytest; \
  _y_return_d = (double) (_yreturn); \
  _bits = _z.i[DE_MANTISSA_LO] &(_mask); \
  if((_bits!=(0x3ff&(_mask))) && (_bits!= (0x400&(_mask)))) {\
    BACK_TO_DOUBLE_MODE; \
    return _y_return_d; \
  } \
}




/* Slow macros with two changes of precision, but who cares, they are
   used at the end of the second step */
#define RETURN_SUM_ROUNDED_DOWN(_rh, _rl) {\
  double _result; \
  DOUBLE_DOWN_MODE; \
  _result = (_rh+_rl); \
  BACK_TO_DOUBLE_MODE; \
  return _result; \
}

#define RETURN_SUM_ROUNDED_UP(_rh, _rl) {\
  double _result; \
  DOUBLE_UP_MODE; \
  _result = (_rh+_rl); \
  BACK_TO_DOUBLE_MODE; \
  return _result; \
}

#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */


/* Itanium path: sanity-check the configuration before going further. */
#if !defined(CRLIBM_TYPECPU_ITANIUM)
#error "This file should be compiled only for IA32 or IA64 architecture "
#endif

/* TODO Add what it takes to compile under HP-UX */
#if !defined(__INTEL_COMPILER)
#error "Use icc, version 8.1 or higher to compile for IA64 architecture"
#endif


/* On IA64 there is no global mode to switch: precision and rounding are
   encoded per-instruction (status fields), so these are no-ops. */
#define DOUBLE_EXTENDED_MODE {}
#define BACK_TO_DOUBLE_MODE {}


/* Round-to-nearest test using the Itanium intrinsics: _Asm_fma rounds
   _y to double (FR_D) in SF0, _Asm_getf extracts the raw significand.
   The halfway-case patterns 0x3ff / 0x400 send us to the accurate
   phase; __builtin_expect marks the early return as the likely path. */
#define DE_TEST_AND_RETURN_RN(_y, _mask) \
{ uint64_t _mantissa, _bits; double _yd; \
  _yd = _Asm_fma(2/*_FR_D*/, 1.0, _y, 0.0, 0/*SF0*/); \
  _mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \
  _bits = _mantissa & (0x7ff&(_mask)); \
  if(__builtin_expect( \
       (_bits!=(0x3ff&(_mask))) && (_bits != (0x400&(_mask))), \
       1+1==2)) \
    return _yd; \
}


/* Slower by 5 cycles as of 2005...
Let us keep it, you never know */ 200 #define DE_TEST_AND_RETURN_RN_ZIV(y,rncst) \ 201 { double yh, yl; \ 202 yh = (double) y; \ 203 yl = y-yh; \ 204 if(__builtin_expect(yh == yh + yl*rncst, 1+1==2)) return yh; \ 205 } 206 207 /* Use this one if you want a final computation step to overlap with 208 the rounding test. Examples: multiplication by a sign or by a power of 2 */ 209 210 #define DE_TEST_AND_RETURN_RN2(_ytest, _yreturn, _mask) \ 211 { uint64_t _mantissa, _bits; \ 212 _mantissa = _Asm_getf(4/*_FR_SIG*/, _ytest); \ 213 _bits = _mantissa & (0x7ff&(_mask)); \ 214 if(__builtin_expect( \ 215 (_bits!=(0x3ff&(_mask))) && (_bits != (0x400&(_mask))), \ 216 1+1==2)) \ 217 return _yreturn; \ 218 } 219 220 221 #define DE_TEST_AND_RETURN_RD(_y, _mask) \ 222 { uint64_t _mantissa, _bits; double _yd; \ 223 _yd = _Asm_fma(2/*_FR_D*/, -1.0, _y, 0.0, 3/*SF3*/); \ 224 _mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \ 225 _bits = _mantissa & (0x7ff&(_mask)); \ 226 if(__builtin_expect( \ 227 (_bits!=(0x000&(_mask))) && (_bits != (0x7ff&(_mask))), \ 228 1+1==2)) \ 229 return -_yd; \ 230 } 231 232 #define DE_TEST_AND_RETURN_RU(_y, _mask) \ 233 { uint64_t _mantissa, _bits; double _yd; \ 234 _yd = _Asm_fma(2/*_FR_D*/, 1.0, _y, 0.0, 3/*SF3*/); \ 235 _mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \ 236 _bits = _mantissa & (0x7ff&(_mask)); \ 237 if(__builtin_expect( \ 238 (_bits!=(0x000&(_mask))) && (_bits != (0x7ff&(_mask))), \ 239 1+1==2)) \ 240 return _yd; \ 241 } 242 243 #define RETURN_SUM_ROUNDED_DOWN(_rh, _rl) \ 244 return -_Asm_fma(2/*_FR_D*/, -1.0, _rh, -_rl, 3/*SF3*/); 245 246 #define RETURN_SUM_ROUNDED_UP(_rh, _rl) \ 247 return _Asm_fma(2/*_FR_D*/, 1.0, _rh, _rl, 3/*SF3*/); 248 249 250 #if 0 251 252 /* This test doesn'use SF2 and SF3 Kept as a model for Pentium implementation, to erase afterwards */ 253 254 #define DE_TEST_AND_RETURN_RU(_y, _mask) \ 255 { \ 256 db_ext_number _z; double _yd; \ 257 unsigned long int _mantissa, _y_double, _isNegative ,_wasRoundedUp, _bits; \ 258 _yd=(double) 
_y; \ 259 _mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \ 260 _y_double = _Asm_getf(2/*_FR_D*/, _yd); \ 261 _bits = _mantissa & (0x7ff&(_mask)); \ 262 _wasRoundedUp = ((_mantissa >>11) & 1) != (_y_double & 1); \ 263 _bits = _mantissa & (0x7ff&(_mask)); \ 264 _isNegative = _y_double >> 63; \ 265 if(_isNegative) { \ 266 if(_wasRoundedUp) { /* RN was RD */ \ 267 if( _bits != (0x7ff&(_mask)) ) { \ 268 _y_double--; \ 269 return (double) _Asm_setf(2/*_FR_D*/, _y_double); \ 270 } /* else launch accurate phase */ \ 271 } \ 272 else{ /* RN was RU, so need to check */ \ 273 if( _bits != (0x000&(_mask)) ) { \ 274 return _yd; \ 275 } /* else launch accurate phase */ \ 276 } \ 277 } \ 278 else{ /* Positive number */ \ 279 if(_wasRoundedUp) { /* RN was RU */ \ 280 if( _bits != (0x7ff&(_mask)) ) { \ 281 return _yd; \ 282 } /* else launch accurate phase */ \ 283 } \ 284 else{ /* RN was RD, */ \ 285 if( _bits != (0x000&(_mask)) ) { \ 286 _y_double++; /* beware, does not work for -0 */ \ 287 return (double) _Asm_setf(2/*_FR_D*/, _y_double); \ 288 } /* else launch accurate phase */ \ 289 } \ 290 } \ 291 } 292 293 294 295 296 #define DE_TEST_AND_RETURN_RD(_y, _mask) \ 297 { \ 298 db_ext_number _z; double _yd; \ 299 unsigned long int _mantissa, _y_double, _isNegative ,_wasRoundedUp, _bits; \ 300 _yd=(double) _y; \ 301 _mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \ 302 _y_double = _Asm_getf(2/*_FR_D*/, _yd); \ 303 _bits = _mantissa & (0x7ff&(_mask)); \ 304 _wasRoundedUp = ((_mantissa >>11) & 1) != (_y_double & 1); \ 305 _bits = _mantissa & (0x7ff&(_mask)); \ 306 _isNegative = _y_double >> 63; \ 307 if(_isNegative) { \ 308 if(_wasRoundedUp) { /* RN was RD */ \ 309 if( _bits != (0x7ff&(_mask)) ) { \ 310 return (double) _y; \ 311 } /* else launch accurate phase */ \ 312 } \ 313 else{ /* RN was RU */ \ 314 if( _bits != (0x000&(_mask)) ) { \ 315 _y_double++; \ 316 return (double) _Asm_setf(2/*_FR_D*/, _y_double); \ 317 } /* else launch accurate phase */ \ 318 } \ 319 } \ 320 else{ /* Positive number 
*/ \ 321 if(_wasRoundedUp) { /* RN was RU */ \ 322 if( _bits != (0x7ff&(_mask)) ) { \ 323 _y_double--; \ 324 return (double) _Asm_setf(2/*_FR_D*/, _y_double); \ 325 } /* else launch accurate phase */ \ 326 } \ 327 else{ /* RN was RD, */ \ 328 if( _bits != (0x000&(_mask)) ) { \ 329 return (double) _y; \ 330 } /* else launch accurate phase */ \ 331 } \ 332 } \ 333 } 334 335 #endif /* 0 */ 336 337 338 339 #endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ 340 341 342 343 344 345 346 347 /**************************************************************************************/ 348 /************************Double double-extended arithmetic*****************************/ 349 /**************************************************************************************/ 350 351 352 353 #define Add12_ext(prh, prl, a, b) \ 354 { \ 355 double_ext _z, _a, _b; \ 356 _a = a; _b = b; \ 357 *prh = _a + _b; \ 358 _z = *prh - _a; \ 359 *prl = _b - _z; \ 360 } 361 362 363 364 365 #if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)) 366 #define Mul12_ext(prh,prl,u,v) \ 367 { \ 368 const double_ext _c = 4294967297.L; /* 2^32 +1 */ \ 369 double_ext _up, _u1, _u2, _vp, _v1, _v2; \ 370 double_ext _u =u, _v=v; \ 371 \ 372 _up = _u*_c; _vp = _v*_c; \ 373 _u1 = (_u-_up)+_up; _v1 = (_v-_vp)+_vp; \ 374 _u2 = _u-_u1; _v2 = _v-_v1; \ 375 \ 376 *prh = _u*_v; \ 377 *prl = _u1*_v1 - *prh; \ 378 *prl = *prl + _u1*_v2; \ 379 *prl = *prl + _u2*_v1; \ 380 *prl = *prl + _u2*_v2; \ 381 } 382 383 #define Mul22_ext(prh,prl, ah,al, bh,bl) \ 384 { \ 385 double_ext mh, ml; \ 386 Mul12_ext(&mh,&ml,(ah),(bh)); \ 387 ml += (ah)*(bl) + (al)*(bh); \ 388 Add12_ext(prh,prl, mh,ml); \ 389 } 390 391 #define FMA22_ext(prh,prl, ah,al, bh,bl, ch,cl) \ 392 { \ 393 Mul22_ext(prh,prl, (ah),(al), (bh),(bl)); \ 394 Add22_ext(prh,prl, ch,cl, *prh, *prl); \ 395 } 396 397 398 399 #else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ 400 401 402 403 404 405 406 #define Mul12_ext( 
prh,prl, a, b ) \ 407 { \ 408 *prh = (a) * (b); \ 409 *prl = _Asm_fms( 3/*_PC_NONE*/, (a), (b), *prh, 1 ); \ 410 } 411 412 413 #if 0 /* transcription of Alexey's */ 414 #define Mul22_ext( prh,prl, ah,al, bh,bl ) \ 415 { \ 416 double_ext _t1,_t2,_t3; \ 417 *prh = (ah) * (bh); \ 418 _t1 = (ah)*(bl); \ 419 _t2 = _Asm_fms( 3/*_PC_NONE*/, (ah), (bh), *prh, 1 ); \ 420 _t3 = (al) * (bh) + _t1; \ 421 *prl = (_t2 + _t3); \ 422 } 423 #else 424 #define Mul22_ext( prh,prl, ah,al, bh,bl ) \ 425 { \ 426 double_ext ph, pl; \ 427 ph = (ah)*(bh); \ 428 pl = _Asm_fms( 3/*_PC_NONE*/, ah, bh, ph, 1/*_SF1*/ ); \ 429 pl = (ah)*(bl) + pl; \ 430 pl = (al)*(bh) + pl; \ 431 Add12_ext(prh,prl, ph,pl); \ 432 } 433 #endif 434 435 /* res = a*b + c, assume |a*b| <= |c| 436 * res, a and b in X format 437 * c in L format 438 */ 439 440 #if 0 441 #define FMA22_ext(prh,prl, ah,al, bh,bl, ch,cl) \ 442 { \ 443 Mul22_ext(prh,prl, (ah),(al), (bh),(bl)); \ 444 Add22_ext(prh,prl, ch,cl, *prh, *prl); \ 445 } 446 #else 447 #define FMA22_ext( prh,prl, ah,al, bh,bl, ch,cl) \ 448 { \ 449 double_ext __xfmagxxx_r_hi__,__xfmagxxx_r_lo__, \ 450 __xfmagxxx_t1__,__xfmagxxx_t2__, \ 451 __xfmagxxx_t3__,__xfmagxxx_t4__; \ 452 __xfmagxxx_r_hi__ = ah * bh + ch; \ 453 __xfmagxxx_t1__ = al * bh + cl; \ 454 __xfmagxxx_t2__ = __xfmagxxx_r_hi__ - ch; \ 455 __xfmagxxx_t3__ = ah * bl + __xfmagxxx_t1__; \ 456 __xfmagxxx_t4__ = _Asm_fms( 3/*_PC_NONE*/, ah, bh, __xfmagxxx_t2__, 1/*_SF1*/ ); \ 457 __xfmagxxx_r_lo__ = (__xfmagxxx_t3__ + __xfmagxxx_t4__); \ 458 *prh = __xfmagxxx_r_hi__; *prl = __xfmagxxx_r_lo__; \ 459 } 460 #endif 461 462 #endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ 463 464 465 466 467 468 469 470 471 #define Div22_ext(prh,prl,xh,xl,yh,yl) \ 472 { \ 473 double_ext ch,cl,uh,ul; \ 474 ch = (xh)/(yh); \ 475 Mul12_ext(&uh,&ul,ch,(yh)); \ 476 cl = (xh)-uh; \ 477 cl = cl - ul; \ 478 cl = cl + (xl); \ 479 cl = cl - ch*(yl); \ 480 cl = cl / (yh); \ 481 Add12(prh,prl, ch, cl) ; \ 482 } 483 484 485 
#define Add22_ext(prh,prl,xh,xl,yh,yl) \ 486 do { \ 487 double_ext _r,_s; \ 488 _r = (xh)+(yh); \ 489 _s = (xh)-_r; \ 490 _s = _s + (yh); \ 491 _s = _s + (yl); \ 492 _s = _s + (xl); \ 493 Add12_ext(prh,prl,_r,_s); \ 494 } while(0) 495 496 #endif /* ifndef __DOUBLE_EXTENDED_H*/ 497