1 /**
2  * Variables and common functions shared by many functions
3  *
4  * This file is part of the crlibm library developed by the Arenaire
5  * project at Ecole Normale Superieure de Lyon
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20  */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include "crlibm.h"
24 #include "crlibm_private.h"
25 
26 
27 /* I wish I could use C99 fenv.h, but as of 2004 it doesn't specify
28    anything about precision, only rounding direction. */
29 
30 #ifdef HAVE_FENV_H
31 #include <fenv.h>
32 #endif
33 
34 /* Tell the compiler that we're going to mess with FP status register */
35 #ifdef FENV_H
36 #pragma STDC FENV_ACCESS ON
37 #endif
38 
39 
40 
41 
42 
43 /* TODO proper init and exit functions
44 
45 - for Itanium, sf0 is set   to RNDouble, sf1 to RNdoubleExt,
46  set sf2 and/or sf3 for
47  directed functions, one should be kept for saving the fpsr when
48  speculating, study operating systems)
49 
50 - for PowerPC: nothing to do usually, however if for some reason the
51   CPU was not in the default state then crlibm won't work
52 
53  */
54 
55 
56 
57 /* An init function which sets FPU flags when needed */
crlibm_init()58 unsigned long long crlibm_init() {
59 #ifndef CRLIBM_TYPEOS_BSD
60 #if defined(CRLIBM_HAS_FPU_CONTROL) && (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
61   unsigned short oldcw, cw;
62 
63 #if 1 /* gcc */
64   /* save old state */
65   _FPU_GETCW(oldcw);
66   /* Set FPU flags to use double, not double extended,
67      with rounding to nearest */
68   cw = (_FPU_DEFAULT & ~_FPU_EXTENDED)|_FPU_DOUBLE;
69   _FPU_SETCW(cw);
70   return (unsigned long long) oldcw;
71 #else  /* Sun Studio  */
72   __asm__ ("movw    $639, -22(%ebp)");
73   __asm__ ("fldcw -22(%ebp)");
74 #endif
75 
76 
77 #elif defined(CRLIBM_TYPECPU_ITANIUM)
78   /* On Itanium we assume that SF2 is used fo speculation, and use only SF3 */
79 
80   unsigned long long int  old_fpsr;
81 
82 #if defined(__INTEL_COMPILER)
83   _Asm_fsetc( 0x00, 0x28, 3 /*_SF3*/ ); /* sf3 = round up, double-precision */
84 
85   //  _Asm_mov_to_ar(40,
86   //	 (old_fpsr & 0xfd000000FFFFFFFFULL) || ((0x18ULL<<32) + (0x28ULL<<45)) );
87 #elif defined(__GNUC__)
88   __asm__ ("fsetc.s3 0, 40\n");
89 #endif /* defined(__INTEL_COMPILER) */
90   old_fpsr = 0 ; /* TODO */
91   return old_fpsr;
92 
93 #else
94   return 0;
95 #endif /* CRLIBM_TYPECPU_X86 || CRLIBM_TYPECPU_AMD64 */
96 #else
97   return 0;
98 #endif
99 }
100 
101 /* An exit function which sets FPU flags to initial value */
crlibm_exit(unsigned long long int oldcw)102 void crlibm_exit(unsigned long long int oldcw) {
103 #ifndef CRLIBM_TYPEOS_BSD
104 #if defined(CRLIBM_HAS_FPU_CONTROL) && (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
105   /* Set FPU flags to use double, not double extended,
106      with rounding to nearest */
107   unsigned short t = (unsigned short)oldcw;
108   _FPU_SETCW(t);
109 #endif
110 #endif
111 }
112 
113 
114 
115 
116 #if ADD22_AS_FUNCTIONS
117 /*
118  * computes double-double addition: zh+zl = xh+xl + yh+yl
119  * relative error is smaller than 2^-103
120  */
121 
Add22Cond(double * zh,double * zl,double xh,double xl,double yh,double yl)122 void Add22Cond(double *zh, double *zl,
123                double xh, double xl, double yh, double yl)
124 {
125   double r,s;
126   r = xh+yh;
127   if ((ABS(xh)) > (ABS(yh)))
128     {s=   ((((xh-r)+yh)+yl)+xl); }
129   else {s=((((yh-r)+xh)+xl)+yl);}
130   *zh = r+s;
131   *zl = r - (*zh) + s;
132 }
133 
134 /*
135  * computes double-double addition: zh+zl = xh+xl + yh+yl
136  * knowing that xh>yh
137  * relative error is smaller than 2^-103
138  */
139 
Add22(double * zh,double * zl,double xh,double xl,double yh,double yl)140 void Add22(double *zh, double *zl, double xh, double xl, double yh, double yl)
141 {
142 double r,s;
143 
144 r = xh+yh;
145 s = xh-r+yh+yl+xl;
146 *zh = r+s;
147 *zl = r - (*zh) + s;
148 }
149 
150 #endif /*ADD22_AS_FUNCTIONS*/
151 
152 
153 
154 #if  DEKKER_AS_FUNCTIONS && (!defined PROCESSOR_HAS_FMA)
155 /* else it is defined in crlibm_private.h */
156 
157 /*
158  * computes rh and rl such that rh + rl = a * b with rh = a @* b exactly
159  * under the conditions : a < 2^970 et b < 2^970
160  */
Mul12(double * rh,double * rl,double u,double v)161 void  Mul12(double *rh, double *rl, double u, double v){
162   const double c = 134217729.;   /*  1+2^27 */
163   double up, u1, u2, vp, v1, v2;
164 
165   up = u*c;        vp = v*c;
166   u1 = (u-up)+up;  v1 = (v-vp)+vp;
167   u2 = u-u1;       v2 = v-v1;
168 
169   *rh = u*v;
170   *rl = (((u1*v1-*rh)+(u1*v2))+(u2*v1))+(u2*v2);
171 }
172 
173 /*
174  * Computes rh and rl such that rh + rl = a * b and rh = a @* b exactly
175  */
Mul12Cond(double * rh,double * rl,double a,double b)176 void Mul12Cond(double *rh, double *rl, double a, double b){
177   const double two_970 = 0.997920154767359905828186356518419283e292;
178   const double two_em53 = 0.11102230246251565404236316680908203125e-15;
179   const double two_e53  = 9007199254740992.;
180   double u, v;
181 
182   if (a>two_970)  u = a*two_em53;
183   else            u = a;
184   if (b>two_970)  v = b*two_em53;
185   else            v = b;
186 
187   Mul12(rh, rl, u, v);
188 
189   if (a>two_970) {*rh *= two_e53; *rl *= two_e53;}
190   if (b>two_970) {*rh *= two_e53; *rl *= two_e53;}
191 }
192 
193 
194 /*
195  * computes double-double multiplication: zh+zl = (xh+xl) *  (yh+yl)
196  * under the conditions : xh < 2^970 et xl < 2^970
197  * relative error is smaller than 2^-102
198  */
199 
Mul22(double * zh,double * zl,double xh,double xl,double yh,double yl)200 void Mul22(double *zh, double *zl, double xh, double xl, double yh, double yl)
201 {
202 double mh, ml;
203 
204   const double c = 134217729.;                /* 0x41A00000, 0x02000000 */
205   double up, u1, u2, vp, v1, v2;
206 
207   up = xh*c;        vp = yh*c;
208   u1 = (xh-up)+up;  v1 = (yh-vp)+vp;
209   u2 = xh-u1;       v2 = yh-v1;
210 
211   mh = xh*yh;
212   ml = (((u1*v1-mh)+(u1*v2))+(u2*v1))+(u2*v2);
213 
214   ml += xh*yl + xl*yh;
215   *zh = mh+ml;
216   *zl = mh - (*zh) + ml;
217 }
218 
219 
220 /*
221  * computes double-double division: pzh+pzl = (xh+xl) /  (yh+yl)
222  * relative error is smaller than 2^-104
223  */
224 
Div22(double * pzh,double * pzl,double xh,double xl,double yh,double yl)225 void Div22(double* pzh, double* pzl, double xh, double xl, double yh, double yl){
226   double _ch,_cl,_uh,_ul;
227   _ch=(xh)/(yh);   Mul12(&_uh,&_ul,_ch,(yh));
228   _cl=(((((xh)-_uh)-_ul)+(xl))-_ch*(yl))/(yh);
229   *pzh=_ch+_cl;   *pzl=(_ch-(*pzh))+_cl;
230 }
231 
232 #endif /* DEKKER_AS_FUNCTIONS && (!defined PROCESSOR_HAS_FMA)  */
233 
234 
235 #if SQRT_AS_FUNCTIONS
236 
237 /*
238    Computes sqrt(x) with a result in double-double precision
239    Should be provable to be exact to at least 100 bits.
240 
241    Only handles the following special cases:
242    - x == 0
243    - subnormal x
244    The following cases are not handled:
245    - x < 0
246    - x = +/-Infty, NaN
247 */
sqrt12(double * resh,double * resl,double x)248 void sqrt12(double *resh, double *resl, double x) {
249   db_number xdb;
250   int E;
251   double m, r0, r1, r2, r3h, r3l, r4h, r4l, srtmh, srtml;
252   double r2PHr2h, r2PHr2l, r2Sqh, r2Sql;
253   double mMr2h, mMr2l, mMr2Ch, mMr2Cl;
254   double MHmMr2Ch, MHmMr2Cl;
255   double r3Sqh, r3Sql, mMr3Sqh, mMr3Sql;
256 
257   /* Special case x = 0 */
258   if (x == 0) {
259     *resh = x;
260     *resl = 0;
261   } else {
262 
263     E = 0;
264 
265     /* Convert to integer format */
266     xdb.d = x;
267 
268     /* Handle subnormal case */
269     if (xdb.i[HI] < 0x00100000) {
270       E = -52;
271       xdb.d *= ((db_number) ((double) SQRTTWO52)).d; 	  /* make x a normal number */
272     }
273 
274     /* Extract exponent E and mantissa m */
275     E += (xdb.i[HI]>>20)-1023;
276     xdb.i[HI] = (xdb.i[HI] & 0x000fffff) | 0x3ff00000;
277     m = xdb.d;
278 
279     /* Make exponent even */
280     if (E & 0x00000001) {
281       E++;
282       m *= 0.5;    /* Suppose now 1/2 <= m <= 2 */
283     }
284 
285     /* Construct sqrt(2^E) = 2^(E/2) */
286     xdb.i[HI] = (E/2 + 1023) << 20;
287     xdb.i[LO] = 0;
288 
289     /* Compute initial approximation to r = 1/sqrt(m) */
290 
291     r0 = SQRTPOLYC0 + m * (SQRTPOLYC1 + m * (SQRTPOLYC2 + m * (SQRTPOLYC3 + m * SQRTPOLYC4)));
292 
293     /* Iterate two times on double precision */
294 
295     r1 = 0.5 * r0 * (3 - m * (r0 * r0));
296     r2 = 0.5 * r1 * (3 - m * (r1 * r1));
297 
298     /* Iterate two times on double-double precision */
299 
300     Mul12(&r2Sqh, &r2Sql, r2, r2);    Add12(r2PHr2h, r2PHr2l, r2, 0.5 * r2);
301     Mul12(&mMr2h, &mMr2l, m, r2);
302     Mul22(&mMr2Ch, &mMr2Cl, mMr2h, mMr2l, r2Sqh, r2Sql);
303 
304     MHmMr2Ch = -0.5 * mMr2Ch;
305     MHmMr2Cl = -0.5 * mMr2Cl;
306 
307     Add22(&r3h, &r3l, r2PHr2h, r2PHr2l, MHmMr2Ch, MHmMr2Cl);
308 
309     Mul22(&r3Sqh, &r3Sql, r3h, r3l, r3h, r3l);
310     Mul22(&mMr3Sqh, &mMr3Sql, m, 0, r3Sqh, r3Sql);  /* To prove: mMr3Sqh = 1.0 in each case */
311 
312     Mul22(&r4h, &r4l, r3h, r3l, 1, -0.5 * mMr3Sql);
313 
314     /* Multiply obtained reciprocal square root by m */
315 
316     Mul22(&srtmh,&srtml,m,0,r4h,r4l);
317 
318     /* Multiply componentwise by sqrt(2^E), which is an integer power of 2 that may not produce a subnormal */
319 
320     *resh = xdb.d * srtmh;
321     *resl = xdb.d * srtml;
322 
323   } /* End: special case 0 */
324 }
325 
326 #endif /* SQRT_AS_FUNCTIONS */
327 
328 
329 
330 
331 
332 #if EVAL_PERF==1
333 /* counter of calls to the second step (accurate step) */
334 int crlibm_second_step_taken;
335 #endif
336 
337 /* A debug functions */
338 
printHexa(char * s,double x)339 void printHexa(char* s, double x) {
340   db_number xdb;
341 
342   xdb.d = x;
343   printf("%s = %08x%08x (%1.8e) exponent = %d exponent of ulp = %d\n",
344 	 s,
345 	 xdb.i[HI],
346 	 xdb.i[LO],
347 	 x,
348 	 ((xdb.i[HI] & 0x7ff00000) >> 20) - 1023,
349 	 ((xdb.i[HI] & 0x7ff00000) >> 20) - 1023 - 52);
350 }
351 
352 
353 
354 
355 #ifdef SCS_TYPECPU_SPARC
356  const scs
357 /* 0   */
358    scs_zer ={{0x00000000, 0x00000000, 0x00000000, 0x00000000},
359              {{0, 0}},  0,   1 },
360 /* 1/2 */
361    scs_half={{0x02000000, 0x00000000, 0x00000000, 0x00000000},
362              DB_ONE, -1,   1 },
363 /*  1  */
364    scs_one ={{0x00000001, 0x00000000, 0x00000000, 0x00000000},
365              DB_ONE,  0,   1 },
366 /*  2  */
367    scs_two ={{0x00000002, 0x00000000, 0x00000000, 0x00000000},
368              DB_ONE,  0,   1 },
369 
370 /* ~1.666667e-01 */
371    scs_sixinv ={{0x0aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa},
372 	     DB_ONE,  -1,   1 };
373 
374 #else
375  const struct scs
376 /* 0   */
377    scs_zer ={{0x00000000, 0x00000000, 0x00000000, 0x00000000,
378              0x00000000, 0x00000000, 0x00000000, 0x00000000},
379              {{0, 0}},  0,   1 },
380 /* 1/2 */
381    scs_half={{0x20000000, 0x00000000, 0x00000000, 0x00000000,
382              0x00000000, 0x00000000, 0x00000000, 0x00000000},
383              DB_ONE, -1,   1 },
384 /*  1  */
385    scs_one ={{0x00000001, 0x00000000, 0x00000000, 0x00000000,
386              0x00000000, 0x00000000, 0x00000000, 0x00000000},
387              DB_ONE,  0,   1 },
388 /*  2  */
389    scs_two ={{0x00000002, 0x00000000, 0x00000000, 0x00000000,
390              0x00000000, 0x00000000, 0x00000000, 0x00000000},
391              DB_ONE,  0,   1 },
392 /* 0.166666*/
393    scs_sixinv ={{0x0aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa,
394 	     0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa},
395 	     DB_ONE,  -1,   1 };
396 
397 #endif
398