#ifndef __DOUBLE_EXTENDED_H
#define __DOUBLE_EXTENDED_H

#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
#include <fpu_control.h>
typedef long double double_ext;
#endif
#if defined(CRLIBM_TYPECPU_ITANIUM) && defined(__INTEL_COMPILER)
#include <stdint.h> /* uint64_t, used by the rounding tests below */
typedef __fpreg double_ext;
#endif


#include <stdio.h> /* printf, for the print_ext debugging macro */

/* For debugging. Assumes the little-endian layout of the 80-bit format, as on x86 */
typedef union {
  int i[3];
  long double d;
} db_ext_number;


#define DE_EXP 2
#define DE_MANTISSA_HI 1
#define DE_MANTISSA_LO 0

#define print_ext(_s, _y) {\
  db_ext_number _yy; _yy.d=_y; \
  printf("%s %04x %08x %08x \n",_s, 0xffff&_yy.i[DE_EXP], _yy.i[DE_MANTISSA_HI], _yy.i[DE_MANTISSA_LO]);  \
}
/**************************************************************************************/
/*********************************Rounding tests***************************************/
/**************************************************************************************/


/* These tests work by observing the bits of the double-extended result after the 53rd.

   mask should be  7ff   if you trust your 64 bits (hum)
                   7fe   if you trust 63 (if you have proven that maxepsilon<2^(-63) )
                   7fc                62
                   7f8                61
                   7f0                60   etc
 */

/* Mask constants for the rounding tests */

#define ACCURATE_TO_64_BITS 0x7ff
#define ACCURATE_TO_63_BITS 0x7fe
#define ACCURATE_TO_62_BITS 0x7fc
#define ACCURATE_TO_61_BITS 0x7f8
#define ACCURATE_TO_60_BITS 0x7f0
#define ACCURATE_TO_59_BITS 0x7e0
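
/* Example: a first step whose evaluation is proven to carry a relative
   error below 2^(-62) would call the tests below as
       DE_TEST_AND_RETURN_RN(y, ACCURATE_TO_62_BITS);                     */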


#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))

static const unsigned short RN_Double     = (_FPU_DEFAULT & ~_FPU_EXTENDED)|_FPU_DOUBLE;
static const unsigned short RN_DoubleDown = (_FPU_DEFAULT & ~_FPU_EXTENDED & ~_FPU_RC_NEAREST)|_FPU_DOUBLE | _FPU_RC_DOWN;
static const unsigned short RN_DoubleUp   = (_FPU_DEFAULT & ~_FPU_EXTENDED & ~_FPU_RC_NEAREST)|_FPU_DOUBLE | _FPU_RC_UP;
static const unsigned short RN_DoubleExt  = _FPU_DEFAULT;

#define DOUBLE_EXTENDED_MODE  _FPU_SETCW(RN_DoubleExt)
#define DOUBLE_UP_MODE        _FPU_SETCW(RN_DoubleUp)
#define DOUBLE_DOWN_MODE      _FPU_SETCW(RN_DoubleDown)
#define BACK_TO_DOUBLE_MODE   _FPU_SETCW(RN_Double)

/*
Two rounding tests to the nearest. On Pentium 3 with gcc 3.3, the second
is faster by 12 cycles (and also improves the worst-case time by 60
cycles, since it doesn't switch the processor rounding mode in that
case). However, it uses a coarser error estimation.
*/

#define DE_TEST_AND_RETURN_RN_ZIV(y,rncst)  \
{ double yh, yl;                            \
  yh = (double) y;                          \
  yl = y-yh;                                \
  BACK_TO_DOUBLE_MODE;                      \
  if(yh == yh + yl*rncst)   return yh;      \
  DOUBLE_EXTENDED_MODE;                     \
}


/* Return if the 11 bits beyond the double mantissa are neither 011...1 nor
   100...0, i.e. if y is provably not near a rounding boundary */
#define DE_TEST_AND_RETURN_RN(_y, _mask)                    \
{                                                           \
  db_ext_number _z;   double _yd;                           \
  int _lo;                                                  \
  _z.d = _y;                                                \
  _yd = (double) _y;                                        \
  _lo = _z.i[DE_MANTISSA_LO] &(_mask);                      \
  if((_lo!=(0x3ff&(_mask))) && (_lo!= (0x400&(_mask)))) {   \
    BACK_TO_DOUBLE_MODE;                                    \
    return _yd;                                             \
  }                                                         \
}
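
/* Illustrative sketch of a typical caller (hypothetical, hence kept under
   #if 0): a first step evaluates in extended precision, the test macro
   returns on the easy cases, and a second, accurate phase handles the rest.
   f_rn_accurate and the toy polynomial are placeholders, not crlibm code. */
#if 0
static double f_rn(double x) {
  double_ext y, xe;
  DOUBLE_EXTENDED_MODE;                           /* x87 to extended precision  */
  xe = (double_ext) x;
  y  = 1.0L + xe*(0.5L + xe*0.125L);              /* toy polynomial, assume
                                                     relative error < 2^(-62)  */
  DE_TEST_AND_RETURN_RN(y, ACCURATE_TO_62_BITS);  /* returns on easy cases     */
  BACK_TO_DOUBLE_MODE;
  return f_rn_accurate(x);                        /* hypothetical second phase */
}
#endif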


#define DE_TEST_AND_RETURN_RD(_y, _mask)                            \
{                                                                   \
  double _result; int _bits;                                        \
  db_ext_number _z;                                                 \
  _z.d = _y;                                                        \
  DOUBLE_DOWN_MODE;                                                 \
  _bits = _z.i[DE_MANTISSA_LO] &(_mask);                            \
  _result = (double)(_y);                                           \
  if( (_bits != (0xfff&(_mask)))  && (_bits != (0x000&(_mask))) ) { \
    BACK_TO_DOUBLE_MODE;                                            \
    return _result;                                                 \
  }                                                                 \
  DOUBLE_EXTENDED_MODE;                                             \
}

#define DE_TEST_AND_RETURN_RU(_y, _mask)                            \
{                                                                   \
  double _result; int _bits;                                        \
  db_ext_number _z;                                                 \
  _z.d = _y;                                                        \
  DOUBLE_UP_MODE;                                                   \
  _bits = _z.i[DE_MANTISSA_LO] &(_mask);                            \
  _result = (double)(_y);                                           \
  if( (_bits != (0xfff&(_mask)))  && (_bits != (0x000&(_mask))) ) { \
    BACK_TO_DOUBLE_MODE;                                            \
    return _result;                                                 \
  }                                                                 \
  DOUBLE_EXTENDED_MODE;                                             \
}


/* Use this one if you want a final computation step to overlap with
   the rounding test. Examples: multiplication by a sign or by a power
   of 2 (see the example below). */

#define DE_TEST_AND_RETURN_RN2(_ytest, _yreturn, _mask)      \
{                                                            \
  db_ext_number _z;   double _y_return_d;                    \
  int _bits;                                                 \
  _z.d = _ytest;                                             \
  _y_return_d = (double) (_yreturn);                         \
  _bits = _z.i[DE_MANTISSA_LO] &(_mask);                     \
  if((_bits!=(0x3ff&(_mask))) && (_bits!= (0x400&(_mask)))) {\
    BACK_TO_DOUBLE_MODE;                                     \
    return _y_return_d;                                      \
  }                                                          \
}
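
/* Example: to return sign*y, let the multiplication by the sign overlap
   with the rounding test:
       DE_TEST_AND_RETURN_RN2(y, sign*y, ACCURATE_TO_62_BITS);            */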



/* Slow macros with two changes of precision, but who cares, they are
   used at the end of the second step */
#define RETURN_SUM_ROUNDED_DOWN(_rh, _rl)   {\
  double _result;        \
  DOUBLE_DOWN_MODE;      \
  _result = (_rh+_rl);   \
  BACK_TO_DOUBLE_MODE;   \
  return _result;        \
}

#define RETURN_SUM_ROUNDED_UP(_rh, _rl)   {\
  double _result;        \
  DOUBLE_UP_MODE;        \
  _result = (_rh+_rl);   \
  BACK_TO_DOUBLE_MODE;   \
  return _result;        \
}

#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */




#if !defined(CRLIBM_TYPECPU_ITANIUM)
#error "This file should be compiled only for the IA32 or IA64 architectures"
#endif

/* TODO Add what it takes to compile under HP-UX */
#if !defined(__INTEL_COMPILER)
#error "Use icc, version 8.1 or higher, to compile for the IA64 architecture"
#endif


#define DOUBLE_EXTENDED_MODE {}
#define BACK_TO_DOUBLE_MODE {}



/* Round to double in the current mode via fma(1.0*y+0.0), then return if
   the 11 bits beyond the double mantissa show that y is not near a
   rounding boundary */
#define DE_TEST_AND_RETURN_RN(_y, _mask)                                     \
{   uint64_t _mantissa, _bits; double _yd;                                   \
    _yd = _Asm_fma(2/*_FR_D*/, 1.0, _y, 0.0, 0/*SF0*/);                      \
    _mantissa = _Asm_getf(4/*_FR_SIG*/, _y);                                 \
    _bits =  _mantissa & (0x7ff&(_mask));                                    \
    if(__builtin_expect(                                                     \
         (_bits!=(0x3ff&(_mask)))  && (_bits != (0x400&(_mask))),            \
         1+1==2))                                                            \
      return _yd;                                                            \
}


/* Slower by 5 cycles as of 2005... Let us keep it, you never know */
#define DE_TEST_AND_RETURN_RN_ZIV(y,rncst)  \
{ double yh, yl;                            \
  yh = (double) y;                          \
  yl = y-yh;                                \
  if(__builtin_expect(yh == yh + yl*rncst, 1+1==2))   return yh;        \
}

/* Use this one if you want a final computation step to overlap with
   the rounding test. Examples: multiplication by a sign or by a power of 2 */

#define DE_TEST_AND_RETURN_RN2(_ytest, _yreturn, _mask)                      \
{   uint64_t _mantissa, _bits;                                               \
    _mantissa = _Asm_getf(4/*_FR_SIG*/, _ytest);                             \
    _bits =  _mantissa & (0x7ff&(_mask));                                    \
    if(__builtin_expect(                                                     \
         (_bits!=(0x3ff&(_mask)))  && (_bits != (0x400&(_mask))),            \
         1+1==2))                                                            \
      return _yreturn;                                                       \
}


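/* The directed-rounding tests and sums below avoid switching the global
   rounding mode: status field SF3 is assumed to have been configured
   beforehand (outside this file) to round toward +infinity, so a
   round-down result is obtained by rounding -y up in SF3 and negating. */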
#define DE_TEST_AND_RETURN_RD(_y, _mask)                                     \
{   uint64_t _mantissa, _bits; double _yd;                                   \
    _yd = _Asm_fma(2/*_FR_D*/, -1.0, _y, 0.0, 3/*SF3*/);                     \
    _mantissa = _Asm_getf(4/*_FR_SIG*/, _y);                                 \
    _bits =  _mantissa & (0x7ff&(_mask));                                    \
    if(__builtin_expect(                                                     \
         (_bits!=(0x000&(_mask)))  && (_bits != (0x7ff&(_mask))),            \
         1+1==2))                                                            \
      return -_yd;                                                           \
}

#define DE_TEST_AND_RETURN_RU(_y, _mask)                                     \
{   uint64_t _mantissa, _bits; double _yd;                                   \
    _yd = _Asm_fma(2/*_FR_D*/, 1.0, _y, 0.0, 3/*SF3*/);                      \
    _mantissa = _Asm_getf(4/*_FR_SIG*/, _y);                                 \
    _bits =  _mantissa & (0x7ff&(_mask));                                    \
    if(__builtin_expect(                                                     \
         (_bits!=(0x000&(_mask)))  && (_bits != (0x7ff&(_mask))),            \
         1+1==2))                                                            \
      return _yd;                                                            \
}

#define RETURN_SUM_ROUNDED_DOWN(_rh, _rl) \
   return -_Asm_fma(2/*_FR_D*/, -1.0, _rh, -_rl, 3/*SF3*/);

#define RETURN_SUM_ROUNDED_UP(_rh, _rl) \
   return _Asm_fma(2/*_FR_D*/, 1.0, _rh, _rl, 3/*SF3*/);


#if 0

/* This test doesn't use SF2 and SF3. Kept as a model for a Pentium
   implementation, to be erased afterwards */

#define DE_TEST_AND_RETURN_RU(_y, _mask)                                     \
{                                                                            \
  double _yd;                                                                \
  unsigned long int _mantissa, _y_double, _isNegative, _wasRoundedUp, _bits; \
  _yd=(double) _y;                                                           \
  _mantissa = _Asm_getf(4/*_FR_SIG*/, _y);                                   \
  _y_double = _Asm_getf(2/*_FR_D*/, _yd);                                    \
  _bits =  _mantissa & (0x7ff&(_mask));                                      \
  _wasRoundedUp = ((_mantissa >>11) & 1) != (_y_double & 1);                 \
  _isNegative = _y_double >> 63;                                             \
  if(_isNegative) {                                                          \
    if(_wasRoundedUp) { /* RN was RD */                                      \
      if( _bits != (0x7ff&(_mask)) ) {                                       \
        _y_double--;                                                         \
        return (double) _Asm_setf(2/*_FR_D*/, _y_double);                    \
      } /* else launch accurate phase */                                     \
    }                                                                        \
    else { /* RN was RU */                                                   \
      if( _bits != (0x000&(_mask)) ) {                                       \
        return _yd;                                                          \
      } /* else launch accurate phase */                                     \
    }                                                                        \
  }                                                                          \
  else { /* Positive number */                                               \
    if(_wasRoundedUp) { /* RN was RU */                                      \
      if( _bits != (0x7ff&(_mask)) ) {                                       \
        return _yd;                                                          \
      } /* else launch accurate phase */                                     \
    }                                                                        \
    else { /* RN was RD */                                                   \
      if( _bits != (0x000&(_mask)) ) {                                       \
        _y_double++; /* beware, does not work for -0 */                      \
        return (double) _Asm_setf(2/*_FR_D*/, _y_double);                    \
      } /* else launch accurate phase */                                     \
    }                                                                        \
  }                                                                          \
}




#define DE_TEST_AND_RETURN_RD(_y, _mask)                                     \
{                                                                            \
  double _yd;                                                                \
  unsigned long int _mantissa, _y_double, _isNegative, _wasRoundedUp, _bits; \
  _yd=(double) _y;                                                           \
  _mantissa = _Asm_getf(4/*_FR_SIG*/, _y);                                   \
  _y_double = _Asm_getf(2/*_FR_D*/, _yd);                                    \
  _bits =  _mantissa & (0x7ff&(_mask));                                      \
  _wasRoundedUp = ((_mantissa >>11) & 1) != (_y_double & 1);                 \
  _isNegative = _y_double >> 63;                                             \
  if(_isNegative) {                                                          \
    if(_wasRoundedUp) { /* RN was RD */                                      \
      if( _bits != (0x7ff&(_mask)) ) {                                       \
        return (double) _y;                                                  \
      } /* else launch accurate phase */                                     \
    }                                                                        \
    else { /* RN was RU */                                                   \
      if( _bits != (0x000&(_mask)) ) {                                       \
        _y_double++;                                                         \
        return (double) _Asm_setf(2/*_FR_D*/, _y_double);                    \
      } /* else launch accurate phase */                                     \
    }                                                                        \
  }                                                                          \
  else { /* Positive number */                                               \
    if(_wasRoundedUp) { /* RN was RU */                                      \
      if( _bits != (0x7ff&(_mask)) ) {                                       \
        _y_double--;                                                         \
        return (double) _Asm_setf(2/*_FR_D*/, _y_double);                    \
      } /* else launch accurate phase */                                     \
    }                                                                        \
    else { /* RN was RD */                                                   \
      if( _bits != (0x000&(_mask)) ) {                                       \
        return (double) _y;                                                  \
      } /* else launch accurate phase */                                     \
    }                                                                        \
  }                                                                          \
}

#endif  /* 0 */



#endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */




/**************************************************************************************/
/************************Double double-extended arithmetic*****************************/
/**************************************************************************************/


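/* Add12_ext is the Fast2Sum procedure (Dekker/Knuth): *prh + *prl == a + b
   exactly, provided |a| >= |b|. */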
#define Add12_ext(prh, prl, a, b)       \
{                                       \
  double_ext _z, _a, _b;                \
  _a = a;   _b = b;                     \
  *prh = _a + _b;                       \
  _z = *prh - _a;                       \
  *prl = _b - _z;                       \
}


#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
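/* Dekker-style exact product: the constant 2^32+1 splits each 64-bit
   significand into two halves of at most 32 bits, so all the partial
   products below are exact, and *prh + *prl == u*v exactly. */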
#define Mul12_ext(prh,prl,u,v)                         \
{                                                      \
  const double_ext _c  = 4294967297.L; /* 2^32+1 */    \
  double_ext _up, _u1, _u2, _vp, _v1, _v2;             \
  double_ext _u =u, _v=v;                              \
                                                       \
  _up = _u*_c;         _vp = _v*_c;                    \
  _u1 = (_u-_up)+_up;  _v1 = (_v-_vp)+_vp;             \
  _u2 = _u-_u1;        _v2 = _v-_v1;                   \
                                                       \
  *prh = _u*_v;                                        \
  *prl = _u1*_v1 - *prh;                               \
  *prl = *prl + _u1*_v2;                               \
  *prl = *prl + _u2*_v1;                               \
  *prl = *prl + _u2*_v2;                               \
}

#define Mul22_ext(prh,prl, ah,al, bh,bl)               \
{                                                      \
  double_ext _mh, _ml;                                 \
  Mul12_ext(&_mh,&_ml,(ah),(bh));                      \
  _ml += (ah)*(bl) + (al)*(bh);                        \
  Add12_ext(prh,prl, _mh,_ml);                         \
}

#define FMA22_ext(prh,prl, ah,al, bh,bl, ch,cl)        \
{                                                      \
  Mul22_ext(prh,prl, (ah),(al), (bh),(bl));            \
  Add22_ext(prh,prl, ch,cl, *prh, *prl);               \
}


#else  /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */


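/* On IA64 the exact product comes directly from the fused multiply-add:
   fms computes a*b - *prh in one operation, which is the (exactly
   representable) rounding error of the product. */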
#define Mul12_ext( prh,prl, a, b )                              \
    {                                                           \
      *prh = (a) * (b);                                         \
      *prl = _Asm_fms( 3/*_PC_NONE*/, (a), (b), *prh, 1 );      \
    }


#if 0 /* transcription of Alexey's */
#define Mul22_ext( prh,prl, ah,al, bh,bl )                       \
    {                                                            \
        double_ext _t1,_t2,_t3;                                  \
        *prh = (ah) * (bh);                                      \
        _t1 = (ah)*(bl);                                         \
        _t2 = _Asm_fms( 3/*_PC_NONE*/, (ah), (bh), *prh, 1 );    \
        _t3 = (al) * (bh) + _t1;                                 \
        *prl = (_t2 + _t3);                                      \
    }
#else
#define Mul22_ext( prh,prl, ah,al, bh,bl )                 \
{                                                          \
  double_ext _ph, _pl;                                     \
  _ph = (ah)*(bh);                                         \
  _pl = _Asm_fms( 3/*_PC_NONE*/, ah, bh, _ph, 1/*_SF1*/ ); \
  _pl = (ah)*(bl) + _pl;                                   \
  _pl = (al)*(bh) + _pl;                                   \
  Add12_ext(prh,prl, _ph,_pl);                             \
}
#endif

/* res = a*b + c, assume |a*b| <= |c|
 *   res, a and b in X format
 *   c in L format
 */

#if 0
#define FMA22_ext(prh,prl, ah,al, bh,bl, ch,cl)        \
{                                                      \
  Mul22_ext(prh,prl, (ah),(al), (bh),(bl));            \
  Add22_ext(prh,prl, ch,cl, *prh, *prl);               \
}
#else
#define FMA22_ext( prh,prl, ah,al, bh,bl, ch,cl)                                           \
    {                                                                                      \
        double_ext __xfmagxxx_r_hi__,__xfmagxxx_r_lo__,                                    \
                __xfmagxxx_t1__,__xfmagxxx_t2__,                                           \
                __xfmagxxx_t3__,__xfmagxxx_t4__;                                           \
        __xfmagxxx_r_hi__ = ah * bh + ch;                                                  \
        __xfmagxxx_t1__ = al * bh + cl;                                                    \
        __xfmagxxx_t2__ = __xfmagxxx_r_hi__ - ch;                                          \
        __xfmagxxx_t3__ = ah * bl + __xfmagxxx_t1__;                                       \
        __xfmagxxx_t4__ = _Asm_fms( 3/*_PC_NONE*/, ah, bh, __xfmagxxx_t2__, 1/*_SF1*/ );   \
        __xfmagxxx_r_lo__ = (__xfmagxxx_t3__ + __xfmagxxx_t4__);                           \
        *prh = __xfmagxxx_r_hi__; *prl = __xfmagxxx_r_lo__;                                \
    }
#endif

#endif    /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */




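/* Division: ch approximates xh/yh; Mul12_ext recovers the rounding error
   of ch*yh exactly, and the correction term cl accounts for that error,
   for xl, and (to first order) for yl. */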
#define  Div22_ext(prh,prl,xh,xl,yh,yl)             \
{                                                   \
  double_ext _ch,_cl,_uh,_ul;                       \
  _ch = (xh)/(yh);                                  \
  Mul12_ext(&_uh,&_ul,_ch,(yh));                    \
  _cl = (xh)-_uh;                                   \
  _cl = _cl - _ul;                                  \
  _cl = _cl + (xl);                                 \
  _cl = _cl - _ch*(yl);                             \
  _cl = _cl / (yh);                                 \
  Add12_ext(prh,prl, _ch, _cl);                     \
}


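/* Addition of two double-double-extended numbers. As with the double
   version of Add22, the operands are assumed ordered so that |xh| >= |yh|. */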
#define Add22_ext(prh,prl,xh,xl,yh,yl)   \
do {                                     \
  double_ext _r,_s;                      \
  _r = (xh)+(yh);                        \
  _s = (xh)-_r;                          \
  _s = _s + (yh);                        \
  _s = _s + (yl);                        \
  _s = _s + (xl);                        \
  Add12_ext(prh,prl,_r,_s);              \
} while(0)

#endif /* ifndef __DOUBLE_EXTENDED_H */