1 /**
2 * Variables and common functions shared by many functions
3 *
4 * This file is part of the crlibm library developed by the Arenaire
5 * project at Ecole Normale Superieure de Lyon
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include "crlibm.h"
24 #include "crlibm_private.h"
25
26
27 /* I wish I could use C99 fenv.h, but as of 2004 it doesn't specify
28 anything about precision, only rounding direction. */
29
30 #ifdef HAVE_FENV_H
31 #include <fenv.h>
32 #endif
33
34 /* Tell the compiler that we're going to mess with FP status register */
35 #ifdef FENV_H
36 #pragma STDC FENV_ACCESS ON
37 #endif
38
39
40
41
42
43 /* TODO proper init and exit functions
44
45 - for Itanium, sf0 is set to RNDouble, sf1 to RNdoubleExt,
46 set sf2 and/or sf3 for
47 directed functions, one should be kept for saving the fpsr when
48 speculating, study operating systems)
49
50 - for PowerPC: nothing to do usually, however if for some reason the
51 CPU was not in the default state then crlibm won't work
52
53 */
54
55
56
57 /* An init function which sets FPU flags when needed */
crlibm_init()58 unsigned long long crlibm_init() {
59 #ifndef CRLIBM_TYPEOS_BSD
60 #if defined(CRLIBM_HAS_FPU_CONTROL) && (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
61 unsigned short oldcw, cw;
62
63 #if 1 /* gcc */
64 /* save old state */
65 _FPU_GETCW(oldcw);
66 /* Set FPU flags to use double, not double extended,
67 with rounding to nearest */
68 cw = (_FPU_DEFAULT & ~_FPU_EXTENDED)|_FPU_DOUBLE;
69 _FPU_SETCW(cw);
70 return (unsigned long long) oldcw;
71 #else /* Sun Studio */
72 __asm__ ("movw $639, -22(%ebp)");
73 __asm__ ("fldcw -22(%ebp)");
74 #endif
75
76
77 #elif defined(CRLIBM_TYPECPU_ITANIUM)
78 /* On Itanium we assume that SF2 is used fo speculation, and use only SF3 */
79
80 unsigned long long int old_fpsr;
81
82 #if defined(__INTEL_COMPILER)
83 _Asm_fsetc( 0x00, 0x28, 3 /*_SF3*/ ); /* sf3 = round up, double-precision */
84
85 // _Asm_mov_to_ar(40,
86 // (old_fpsr & 0xfd000000FFFFFFFFULL) || ((0x18ULL<<32) + (0x28ULL<<45)) );
87 #elif defined(__GNUC__)
88 __asm__ ("fsetc.s3 0, 40\n");
89 #endif /* defined(__INTEL_COMPILER) */
90 old_fpsr = 0 ; /* TODO */
91 return old_fpsr;
92
93 #else
94 return 0;
95 #endif /* CRLIBM_TYPECPU_X86 || CRLIBM_TYPECPU_AMD64 */
96 #else
97 return 0;
98 #endif
99 }
100
101 /* An exit function which sets FPU flags to initial value */
crlibm_exit(unsigned long long int oldcw)102 void crlibm_exit(unsigned long long int oldcw) {
103 #ifndef CRLIBM_TYPEOS_BSD
104 #if defined(CRLIBM_HAS_FPU_CONTROL) && (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
105 /* Set FPU flags to use double, not double extended,
106 with rounding to nearest */
107 unsigned short t = (unsigned short)oldcw;
108 _FPU_SETCW(t);
109 #endif
110 #endif
111 }
112
113
114
115
116 #if ADD22_AS_FUNCTIONS
117 /*
118 * computes double-double addition: zh+zl = xh+xl + yh+yl
119 * relative error is smaller than 2^-103
120 */
121
Add22Cond(double * zh,double * zl,double xh,double xl,double yh,double yl)122 void Add22Cond(double *zh, double *zl,
123 double xh, double xl, double yh, double yl)
124 {
125 double r,s;
126 r = xh+yh;
127 if ((ABS(xh)) > (ABS(yh)))
128 {s= ((((xh-r)+yh)+yl)+xl); }
129 else {s=((((yh-r)+xh)+xl)+yl);}
130 *zh = r+s;
131 *zl = r - (*zh) + s;
132 }
133
134 /*
135 * computes double-double addition: zh+zl = xh+xl + yh+yl
136 * knowing that xh>yh
137 * relative error is smaller than 2^-103
138 */
139
Add22(double * zh,double * zl,double xh,double xl,double yh,double yl)140 void Add22(double *zh, double *zl, double xh, double xl, double yh, double yl)
141 {
142 double r,s;
143
144 r = xh+yh;
145 s = xh-r+yh+yl+xl;
146 *zh = r+s;
147 *zl = r - (*zh) + s;
148 }
149
150 #endif /*ADD22_AS_FUNCTIONS*/
151
152
153
154 #if DEKKER_AS_FUNCTIONS && (!defined PROCESSOR_HAS_FMA)
155 /* else it is defined in crlibm_private.h */
156
157 /*
158 * computes rh and rl such that rh + rl = a * b with rh = a @* b exactly
159 * under the conditions : a < 2^970 et b < 2^970
160 */
Mul12(double * rh,double * rl,double u,double v)161 void Mul12(double *rh, double *rl, double u, double v){
162 const double c = 134217729.; /* 1+2^27 */
163 double up, u1, u2, vp, v1, v2;
164
165 up = u*c; vp = v*c;
166 u1 = (u-up)+up; v1 = (v-vp)+vp;
167 u2 = u-u1; v2 = v-v1;
168
169 *rh = u*v;
170 *rl = (((u1*v1-*rh)+(u1*v2))+(u2*v1))+(u2*v2);
171 }
172
173 /*
174 * Computes rh and rl such that rh + rl = a * b and rh = a @* b exactly
175 */
Mul12Cond(double * rh,double * rl,double a,double b)176 void Mul12Cond(double *rh, double *rl, double a, double b){
177 const double two_970 = 0.997920154767359905828186356518419283e292;
178 const double two_em53 = 0.11102230246251565404236316680908203125e-15;
179 const double two_e53 = 9007199254740992.;
180 double u, v;
181
182 if (a>two_970) u = a*two_em53;
183 else u = a;
184 if (b>two_970) v = b*two_em53;
185 else v = b;
186
187 Mul12(rh, rl, u, v);
188
189 if (a>two_970) {*rh *= two_e53; *rl *= two_e53;}
190 if (b>two_970) {*rh *= two_e53; *rl *= two_e53;}
191 }
192
193
194 /*
195 * computes double-double multiplication: zh+zl = (xh+xl) * (yh+yl)
196 * under the conditions : xh < 2^970 et xl < 2^970
197 * relative error is smaller than 2^-102
198 */
199
Mul22(double * zh,double * zl,double xh,double xl,double yh,double yl)200 void Mul22(double *zh, double *zl, double xh, double xl, double yh, double yl)
201 {
202 double mh, ml;
203
204 const double c = 134217729.; /* 0x41A00000, 0x02000000 */
205 double up, u1, u2, vp, v1, v2;
206
207 up = xh*c; vp = yh*c;
208 u1 = (xh-up)+up; v1 = (yh-vp)+vp;
209 u2 = xh-u1; v2 = yh-v1;
210
211 mh = xh*yh;
212 ml = (((u1*v1-mh)+(u1*v2))+(u2*v1))+(u2*v2);
213
214 ml += xh*yl + xl*yh;
215 *zh = mh+ml;
216 *zl = mh - (*zh) + ml;
217 }
218
219
220 /*
221 * computes double-double division: pzh+pzl = (xh+xl) / (yh+yl)
222 * relative error is smaller than 2^-104
223 */
224
Div22(double * pzh,double * pzl,double xh,double xl,double yh,double yl)225 void Div22(double* pzh, double* pzl, double xh, double xl, double yh, double yl){
226 double _ch,_cl,_uh,_ul;
227 _ch=(xh)/(yh); Mul12(&_uh,&_ul,_ch,(yh));
228 _cl=(((((xh)-_uh)-_ul)+(xl))-_ch*(yl))/(yh);
229 *pzh=_ch+_cl; *pzl=(_ch-(*pzh))+_cl;
230 }
231
232 #endif /* DEKKER_AS_FUNCTIONS && (!defined PROCESSOR_HAS_FMA) */
233
234
235 #if SQRT_AS_FUNCTIONS
236
237 /*
238 Computes sqrt(x) with a result in double-double precision
239 Should be provable to be exact to at least 100 bits.
240
241 Only handles the following special cases:
242 - x == 0
243 - subnormal x
244 The following cases are not handled:
245 - x < 0
246 - x = +/-Infty, NaN
247 */
sqrt12(double * resh,double * resl,double x)248 void sqrt12(double *resh, double *resl, double x) {
249 db_number xdb;
250 int E;
251 double m, r0, r1, r2, r3h, r3l, r4h, r4l, srtmh, srtml;
252 double r2PHr2h, r2PHr2l, r2Sqh, r2Sql;
253 double mMr2h, mMr2l, mMr2Ch, mMr2Cl;
254 double MHmMr2Ch, MHmMr2Cl;
255 double r3Sqh, r3Sql, mMr3Sqh, mMr3Sql;
256
257 /* Special case x = 0 */
258 if (x == 0) {
259 *resh = x;
260 *resl = 0;
261 } else {
262
263 E = 0;
264
265 /* Convert to integer format */
266 xdb.d = x;
267
268 /* Handle subnormal case */
269 if (xdb.i[HI] < 0x00100000) {
270 E = -52;
271 xdb.d *= ((db_number) ((double) SQRTTWO52)).d; /* make x a normal number */
272 }
273
274 /* Extract exponent E and mantissa m */
275 E += (xdb.i[HI]>>20)-1023;
276 xdb.i[HI] = (xdb.i[HI] & 0x000fffff) | 0x3ff00000;
277 m = xdb.d;
278
279 /* Make exponent even */
280 if (E & 0x00000001) {
281 E++;
282 m *= 0.5; /* Suppose now 1/2 <= m <= 2 */
283 }
284
285 /* Construct sqrt(2^E) = 2^(E/2) */
286 xdb.i[HI] = (E/2 + 1023) << 20;
287 xdb.i[LO] = 0;
288
289 /* Compute initial approximation to r = 1/sqrt(m) */
290
291 r0 = SQRTPOLYC0 + m * (SQRTPOLYC1 + m * (SQRTPOLYC2 + m * (SQRTPOLYC3 + m * SQRTPOLYC4)));
292
293 /* Iterate two times on double precision */
294
295 r1 = 0.5 * r0 * (3 - m * (r0 * r0));
296 r2 = 0.5 * r1 * (3 - m * (r1 * r1));
297
298 /* Iterate two times on double-double precision */
299
300 Mul12(&r2Sqh, &r2Sql, r2, r2); Add12(r2PHr2h, r2PHr2l, r2, 0.5 * r2);
301 Mul12(&mMr2h, &mMr2l, m, r2);
302 Mul22(&mMr2Ch, &mMr2Cl, mMr2h, mMr2l, r2Sqh, r2Sql);
303
304 MHmMr2Ch = -0.5 * mMr2Ch;
305 MHmMr2Cl = -0.5 * mMr2Cl;
306
307 Add22(&r3h, &r3l, r2PHr2h, r2PHr2l, MHmMr2Ch, MHmMr2Cl);
308
309 Mul22(&r3Sqh, &r3Sql, r3h, r3l, r3h, r3l);
310 Mul22(&mMr3Sqh, &mMr3Sql, m, 0, r3Sqh, r3Sql); /* To prove: mMr3Sqh = 1.0 in each case */
311
312 Mul22(&r4h, &r4l, r3h, r3l, 1, -0.5 * mMr3Sql);
313
314 /* Multiply obtained reciprocal square root by m */
315
316 Mul22(&srtmh,&srtml,m,0,r4h,r4l);
317
318 /* Multiply componentwise by sqrt(2^E), which is an integer power of 2 that may not produce a subnormal */
319
320 *resh = xdb.d * srtmh;
321 *resl = xdb.d * srtml;
322
323 } /* End: special case 0 */
324 }
325
326 #endif /* SQRT_AS_FUNCTIONS */
327
328
329
330
331
332 #if EVAL_PERF==1
333 /* counter of calls to the second step (accurate step) */
334 int crlibm_second_step_taken;
335 #endif
336
337 /* A debug functions */
338
printHexa(char * s,double x)339 void printHexa(char* s, double x) {
340 db_number xdb;
341
342 xdb.d = x;
343 printf("%s = %08x%08x (%1.8e) exponent = %d exponent of ulp = %d\n",
344 s,
345 xdb.i[HI],
346 xdb.i[LO],
347 x,
348 ((xdb.i[HI] & 0x7ff00000) >> 20) - 1023,
349 ((xdb.i[HI] & 0x7ff00000) >> 20) - 1023 - 52);
350 }
351
352
353
354
355 #ifdef SCS_TYPECPU_SPARC
356 const scs
357 /* 0 */
358 scs_zer ={{0x00000000, 0x00000000, 0x00000000, 0x00000000},
359 {{0, 0}}, 0, 1 },
360 /* 1/2 */
361 scs_half={{0x02000000, 0x00000000, 0x00000000, 0x00000000},
362 DB_ONE, -1, 1 },
363 /* 1 */
364 scs_one ={{0x00000001, 0x00000000, 0x00000000, 0x00000000},
365 DB_ONE, 0, 1 },
366 /* 2 */
367 scs_two ={{0x00000002, 0x00000000, 0x00000000, 0x00000000},
368 DB_ONE, 0, 1 },
369
370 /* ~1.666667e-01 */
371 scs_sixinv ={{0x0aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa},
372 DB_ONE, -1, 1 };
373
374 #else
375 const struct scs
376 /* 0 */
377 scs_zer ={{0x00000000, 0x00000000, 0x00000000, 0x00000000,
378 0x00000000, 0x00000000, 0x00000000, 0x00000000},
379 {{0, 0}}, 0, 1 },
380 /* 1/2 */
381 scs_half={{0x20000000, 0x00000000, 0x00000000, 0x00000000,
382 0x00000000, 0x00000000, 0x00000000, 0x00000000},
383 DB_ONE, -1, 1 },
384 /* 1 */
385 scs_one ={{0x00000001, 0x00000000, 0x00000000, 0x00000000,
386 0x00000000, 0x00000000, 0x00000000, 0x00000000},
387 DB_ONE, 0, 1 },
388 /* 2 */
389 scs_two ={{0x00000002, 0x00000000, 0x00000000, 0x00000000,
390 0x00000000, 0x00000000, 0x00000000, 0x00000000},
391 DB_ONE, 0, 1 },
392 /* 0.166666*/
393 scs_sixinv ={{0x0aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa,
394 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa, 0x2aaaaaaa},
395 DB_ONE, -1, 1 };
396
397 #endif
398