1 /******************************************************************************
2   Copyright (c) 2007-2011, Intel Corp.
3   All rights reserved.
4 
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions are met:
7 
8     * Redistributions of source code must retain the above copyright notice,
9       this list of conditions and the following disclaimer.
10     * Redistributions in binary form must reproduce the above copyright
11       notice, this list of conditions and the following disclaimer in the
12       documentation and/or other materials provided with the distribution.
13     * Neither the name of Intel Corporation nor the names of its contributors
14       may be used to endorse or promote products derived from this software
15       without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27   THE POSSIBILITY OF SUCH DAMAGE.
28 ******************************************************************************/
29 
30 #if defined(FAST_SQRT) + defined(SQRT) + defined(RSQRT) + defined(MAKE_INCLUDE) != 1
31 #       error Exactly one of SQRT, FAST_SQRT, RSQRT, or MAKE_INCLUDE must be defined.
32 #endif
33 
34 #if defined(FAST_SQRT)
35 #	define __ENTRY_NAME F_FAST_SQRT_NAME
36 #	define ___BASE_NAME   FAST_SQRT_BASE_NAME
37 #	define IF_SQRT(x) x
38 #	define IF_RSQRT(x)
39 #elif defined(RSQRT)
40 #	define __ENTRY_NAME F_RSQRT_NAME
41 #	define ___BASE_NAME   RSQRT_BASE_NAME
42 #	define IF_SQRT(x)
43 #	define IF_RSQRT(x) x
44 #elif defined(SQRT)
45 #	define __ENTRY_NAME F_SQRT_NAME
46 #	define ___BASE_NAME   SQRT_BASE_NAME
47 #	define IF_SQRT(x) x
48 #	define IF_RSQRT(x)
49 #endif
50 
51 #if !defined(F_ENTRY_NAME)
52 #	define F_ENTRY_NAME __ENTRY_NAME
53 #endif
54 #if !defined(BASE_NAME)
55 #	define BASE_NAME ___BASE_NAME
56 #endif
57 
58 #if !defined(BUILD_FILE_EXTENSION)
59 #	define BUILD_FILE_EXTENSION c
60 #endif
61 
62 
63 #include "dpml_private.h"
64 
65 #if (DYNAMIC_ROUNDING_MODES) || (COMPILER == epc_cc)
66 #	define ESTABLISH_ROUND_TO_ZERO(old_mode) \
67 			INIT_FPU_STATE_AND_ROUND_TO_ZERO(old_mode)
68 #	define RESTORE_ROUNDING_MODE(old_mode) \
69 			RESTORE_FPU_STATE(old_mode)
70 #else
71 #	define ESTABLISH_ROUND_TO_ZERO(old_mode)
72 #	define RESTORE_ROUNDING_MODE(old_mode)
73 #endif
74 
75 
76 #if !defined(F_MUL_CHOPPED)
77 
78 	/* This definition of F_MUL_CHOPPED is used for dynamic
79 	rounding modes and when no directed rounding is available.
80 	In the later case results will not be correctly rounded.  */
81 
82 #	define F_MUL_CHOPPED(x,y,z) (z) = (x) * (y)
83 
84 #endif
85 
86 
87 /*
88 ** NUM_FRAC_BITS specifies the number of mantissa bits used for
89 ** indexing the table (the table index also includes the low-order
90 ** exponent bit).  NUM_FRAC_BITS also affects the table size:
91 **
92 **	sizeof(D_SQRT_TABLE_NAME) = (1 << (NUM_FRAC_BITS + 1))
93 **				    * (2*sizeof(float)+sizeof(double))
94 */
95 
96 #define NUM_FRAC_BITS 7
97 #define INDEX_MASK MAKE_MASK((NUM_FRAC_BITS + 1), 0)
98 
99 
100 #if (IEEE_FLOATING)
101 
102 /*
103 **	LOC_OF_EXPON is the bit offset within u.B_SIGNED_HI_32 of the
104 **	low-order exponent bit of u.f, where u is a B_UNION.  (We assume
105 **	the highest bits of B_SIGNED_HI_32 hold the sign bit and exponent).
106 **
107 **	From LOC_OF_EXPON, EXP_BITS_OF_ONE_HALF and HI_EXP_BIT_MASK are derived.
108 */
109 
110 #       define LOC_OF_EXPON ((BITS_PER_LS_INT_TYPE - 1) - B_EXP_WIDTH)
111 #       define EXP_BITS_OF_ONE_HALF  ((U_LS_INT_TYPE)(B_EXP_BIAS-B_NORM-1) << LOC_OF_EXPON)
112 #	define HI_EXP_BIT_MASK   (MAKE_MASK(B_EXP_WIDTH-1, 1) << LOC_OF_EXPON)
113 
114 #	define GET_SQRT_TABLE_INDEX(exp,index) \
115 		index = (exp >> (LOC_OF_EXPON - NUM_FRAC_BITS)); \
116 		index &= INDEX_MASK
117 
118 /*
119 **	SAVE_EXP saves the exponent in a temporary so it can be used in
120 **	the INPUT_IS_ABNORMAL macro
121 */
122 
123 #	define SAVE_EXP(exp) save_exp = (exp)
124 #	define INPUT_IS_ABNORMAL \
125 		((U_LS_INT_TYPE)(save_exp-((LS_INT_TYPE)1 << LOC_OF_EXPON)) >= \
126 			(U_LS_INT_TYPE)hi_exp_mask)
127 #endif
128 
129 #if (VAX_FLOATING)
130 
131 #	define EXP_BITS_OF_ONE_HALF 0x4000
132 #	define HI_EXP_BIT_MASK 0x7fe0
133 
134 #	define GET_SQRT_TABLE_INDEX(exp,index) \
135 		index = ((exp << 3) | ((U_INT_32)exp >> 29)); \
136 		index &= INDEX_MASK
137 
138 #	define SAVE_EXP(exp)	/* INPUT_IS_ABNORMAL doesn't need it */
139 #	define INPUT_IS_ABNORMAL (x <= (F_TYPE)0.0)
140 
141 #endif
142 
143 
144 #if ((ARCHITECTURE == alpha) || (BITS_PER_WORD == 64))
145 
146       /* We can do 64-bit stores */
147       /* This is an optimization of the 'else' clause below */
148 #     if QUAD_PRECISION
149 #	   define STORE_EXP_TO_V_UNION \
150 		V_UNION_128_BIT_STORE
151 #     else
152 #	   define STORE_EXP_TO_V_UNION \
153 		V_UNION_64_BIT_STORE
154 #     endif
155 
156 #else
157 
158       /* Store it in 32-bits pieces */
159 #     if QUAD_PRECISION
160 #	   define STORE_EXP_TO_V_UNION \
161 		v.B_SIGNED_HI_32 = ((U_INT_32)exp) >> 1; \
162 		v.B_SIGNED_LO1_32 = 0; \
163 		v.B_SIGNED_LO2_32 = 0; \
164 		v.B_SIGNED_LO3_32 = 0
165 #     else
166 #	   define STORE_EXP_TO_V_UNION \
167 		v.B_SIGNED_HI_32 = ((U_INT_32)exp) >> 1; \
168 		v.B_SIGNED_LO_32 = 0
169 #     endif
170 
171 #endif
172 
173 
174 /* This condition is complicated.  */
175 
176 #if (VAX_FLOATING) == (ENDIANESS == little_endian)
177 #	define V_UNION_64_BIT_STORE \
178 		v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) >> 1
179 #	define V_UNION_128_BIT_STORE \
180 		v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) >> 1; \
181                 v.B_UNSIGNED_LO_64 = 0
182 #elif ((ARCHITECTURE == alpha) && defined(HAS_LOAD_WRONG_STORE_SIZE_PENALTY))
183 #	define V_UNION_64_BIT_STORE \
184 		v.B_UNSIGNED_HI_64 = ((U_WORD)exp) >> 1
185 #	define V_UNION_128_BIT_STORE \
186 		v.B_UNSIGNED_HI_64 = ((U_WORD)exp) >> 1; \
187                 v.B_UNSIGNED_LO_64 = 0
188 #else
189 #       define V_UNION_64_BIT_STORE \
190                 v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) << 31
191 #       define V_UNION_128_BIT_STORE \
192                 v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) << 31; \
193                 v.B_UNSIGNED_LO_64 = 0
194 #endif
195 
196 
197 /*
198 ** The definitions of SQRT_COEF_STRUCT and D_SQRT_TABLE_NAME also
199 ** appear in the generated .c file for the table.
200 */
201 typedef struct {
202 	float a, b;
203 	double c;
204 } SQRT_COEF_STRUCT;
205 
206 extern const SQRT_COEF_STRUCT D_SQRT_TABLE_NAME[(1<<(NUM_FRAC_BITS+1))];
207 
208 
209 /*
210 **  SCALE_AND_DO_INDEXED_POLY_APPROX
211 **
212 **	Inputs:
213 **		x		any number
214 **				= f * 2^(2*i+j)
215 **			where	1/2 <= f < 1, integer i and j,
216 **				and j = 0 or 1
217 **			ignoring f <= 0
218 **
219 **	Outputs:
220 **		half_scale	= 2^(i-1)	(SQRT, F_SQRT)
221 **
222 **		flah_scale	= 2^(1-i)	(RSQRT)
223 **				(the name is clear, albeit cute)
224 **
225 **		scaled_x	= f * 2^j
226 **			so	1/2 <= scaled_x < 2
227 **
228 **		y		~= 1/sqrt(scaled_x)
229 **
230 **			so	sqrt(x) ~= y * scaled_x * 2 * half_scale
231 **			and	1/sqrt(x) ~= y / (2 * half_scale)
232 **
233 **	Temporaries:
234 **		u, a, b, c, index
235 */
236 
237 #define SCALE_AND_DO_INDEXED_POLY_APPROX \
238 	u.f = (B_TYPE)x; \
239 	exp = u.B_HI_LS_INT_TYPE; \
240 	B_COPY_SIGN_AND_EXP((B_TYPE)x, half, y); \
241 	ASSERT( ((0.5 <= y) && (y < 1.0)) ); \
242 	GET_SQRT_TABLE_INDEX(exp,index); \
243 	b = (B_TYPE)D_SQRT_TABLE_NAME[index].b; \
244 	b *= y;	 \
245 	c = (B_TYPE)D_SQRT_TABLE_NAME[index].c; \
246 	lo_exp_bit_and_hi_frac = exp & ~hi_exp_mask; \
247 	u.B_HI_LS_INT_TYPE = (exp_of_one_half | lo_exp_bit_and_hi_frac); \
248 	c += b; \
249 	scaled_x = u.f; \
250 	ASSERT( (((0.5 <= scaled_x) && (scaled_x < 2.0)) || (scaled_x < 0.0)) ); \
251 	y *= y; \
252 	a = (B_TYPE)D_SQRT_TABLE_NAME[index].a; \
253 	SAVE_EXP(exp); \
254 	IF_SQRT ({ \
255 	    exp ^= lo_exp_bit_and_hi_frac; \
256 	    exp += exp_of_one_half; \
257 	}) \
258 	IF_RSQRT({ \
259 	    exp ^= lo_exp_bit_and_hi_frac; \
260             exp = 3*exp_of_one_half - exp; \
261 	}) \
262 	y *= a; \
263 	STORE_EXP_TO_V_UNION; \
264 	y += c; \
265 	IF_SQRT ( half_scale = v.f ); \
266 	IF_RSQRT( flah_scale = v.f ); \
267 	/* end of SCALE_AND_DO_INDEXED_POLY_APPROX */
268 
269 
270 
271 /*----------------------------------------------------------------------------*/
272 /*                      Tuckerman's Rounding                                  */
273 /*----------------------------------------------------------------------------*/
274 
275 /*
276 ** Tuckerman's rounding is used to compute the correctly rounded sqrt(x).
277 ** It's 'good to the last bit', or more precisely 'to within 1/2 lsb(sqrt(x))'.
278 ** This is a short proof of Tuckerman's rounding.
279 **
280 ** Let z be a machine-precision approximation to sqrt(x); then z+lsb(z) is the
281 ** smallest representable number larger than z (NB: z-lsb(z) is the largest
282 ** representable number less than z, _except_ when z is a power of 2).
283 ** Within this proof, let [] represent _truncation_ to machine precision,
284 ** and {} represent _rounding_ to machine precision.
285 **
286 ** Note that for _any_ y (not necessarily representable in machine precision),
287 **
288 **      z + 1/2 lsb(z) <= y  <==>  z < {y}.
289 **
290 ** For sqrt(x), we never have equality:
291 **      z + 1/2 lsb(z) <= sqrt(x)  ==>  z + 1/2 lsb(z) < sqrt(x),
292 ** because if they were equal, we'd have:
293 **      (z + 1/2 lsb(z))^2 = x
294 ** which is impossible, because to represent the left hand side requires more
295 ** than twice the machine precision, while the right hand side is representable.
296 **
297 ** Now the following statements are equivalent in turn:
298 **
299 **              z < {sqrt(x)}
300 **              z + 1/2 lsb(z) <= sqrt(x)
301 **              z + 1/2 lsb(z) < sqrt(x)
302 **              (z +  1/2 lsb(z))^2 < x
303 **              z (z + 1/2 lsb(z)) < x          (the reverse is proved below)
304 **              [ z (z + 1/2 lsb(z)) ] < x.
305 **
306 ** To complete the reverse of the third inference above, suppose it were false.
307 ** Then: z (z + 1/2 lsb(z)) < x <= (z +  1/2 lsb(z))^2.  The left hand side is
308 ** some multiple of 1/2 lsb(z)^2.  The right hand side is only larger by
309 ** d = 1/4 lsb(z)^2, so [rhs] = [rhs-d] = [lhs].  But the inequality implies
310 ** [lhs] < x <= [rhs], and we have a contradiction.
311 **
312 ** In conclusion,
313 **              z < {sqrt(x)}  <==>  [ z (z + 1/2 lsb(z)) ] < x.
314 */
315 
316 /*
317 ** Here we cover another question:  How closely must y approximate sqrt(x) to
318 ** ensure {y} = {sqrt(x)}, where x is a representable number?  We state without
319 ** proof that the closest sqrt(x) approaches a value halfway between consecutive
320 ** representable numbers occurs either when x is just larger than a power of 4,
321 ** or just less than a power of 4.  We have:
322 **
323 **	sqrt(4^k*(1+lsb( 1 ))) = 2^k*(1 + lsb( 1 )/2 - lsb( 1 )^2/8 + ...), and
324 **	sqrt(4^k*(1-lsb(1/2))  = 2^k*(1 - lsb(1/2)/2 - lsb(1/2)^2/8 - ...).
325 **
326 ** So if |y - sqrt(x)| < lsb(sqrt(x))^2/8 - O(lsb^3), {y} = {sqrt(x)}.
327 ** For our purposes, this means that 50-bit accuracy (barely) suffices to
328 ** produce a correctly-rounded 24-bit result, since (2^(1-24))^2/8 = 2^(1-50).
329 ** After our Newton's iteration, we have nearly 53-bit accuracy.  All is well.
330 */
331 
332 /*----------------------------------------------------------------------------*/
333 /*			Computing 'x+' and 'x-'				      */
334 /*----------------------------------------------------------------------------*/
335 
336 /*
337 ** For Tuckerman's rounding, we need to compute the (machine-)representable
338 ** numbers just after and before a representable x: 'x+' = x + lsb(x) and
339 ** 'x-' = x - lsb(x-lsb(x)).  Letting '{}' denote rounding to machine precision,
340 ** we compute these by:
341 **
342 **	'x+' = {x + {c x}}			(1)
343 **	'x-' = {x - {c x}}			(2)
344 **
345 ** for some appropriate constant c, where neither x+{c x} nor x-{c x} are midway
346 ** between two consecutive representable numbers.
347 **
348 ** The weakest preconditions that satisfy the above are:
349 **
350 **	1/2 lsb(x) < {c x} < 3/2 lsb(x)		(1a), when x != 2^n(1-lsb(1/2))
351 **	1/2 lsb(x) < {c x} <  2  lsb(x)		(1b), when x = 2^n(1-lsb(1/2))
352 **	1/2 lsb(x) < {c x} < 3/2 lsb(x)		(2a), when x != 2^n
353 **	1/4 lsb(x) < {c x} < 3/4 lsb(x)		(2b), when x = 2^n
354 **
355 ** For (1a), (1b), and (2a), we can take:
356 **
357 **	1/2 lsb(x)/x < c < 3/2 lsb(x)/x, which we can 'shrink' to simplify:
358 ** 	1/2 lsb(1)/1 < c < 3/2 lsb(1)/2
359 **	1/2 lsb(1) < c < 3/4 lsb(1)
360 **
361 ** For (2b), we require:
362 **
363 **	1/4 lsb(1) < c < 3/4 lsb(1)
364 **
365 ** Thus, in any case, we can use any c in the range:
366 **
367 **	1/2 lsb(1) < c < 3/4 lsb(1)
368 **
369 ** We choose the midpoint:
370 **
371 **	c = 5/8 lsb(1) = 5/8 2^(1-p) = 5/4 2^(-p)
372 **
373 ** FWIW: It's possibly to compute 'x-' by:	'x-' = {x * (1-lsb(1/2))},
374 ** but 'x+' isn't necessarily computed by:	'x+' = {x * (1+lsb(1))}.
375 */
376 
377 #if defined(SQRT)
378 #   if (F_PRECISION == 24)
379 #	define ULP_FACTOR (F_TYPE)7.450580596923828125e-8
380 #   elif (F_PRECISION == 53)
381 #	define ULP_FACTOR (F_TYPE)1.387778780781445675529539585113525390625e-16
382 #   elif (F_PRECISION == 56)
383 #	define ULP_FACTOR (F_TYPE)1.7347234759768070944119244813919067382813e-17
384 #   elif (F_PRECISION == 113)
385 #	define ULP_FACTOR (F_TYPE)1.203706215242022408159986214115579574086314e-34
386 #   else
387 #	define ULP_FACTOR (F_TYPE)1.25/(F_POW_2(F_PRECISION))
388 #   endif
389 #endif
390 
391 
392 
393 /*----------------------------------------------------------------------------*/
394 /*			Newton's Iteration     				      */
395 /*----------------------------------------------------------------------------*/
396 
397 /*
398     Newton's iteration for 1 / (nth root of x) is:
399 
400 	y' = y + [ (1 - x * y^n) * y / n ]
401 
402     So, the iteration for 1 / sqrt(x) is:
403 
404 	y' = y + [ (1 - x * y^2) * y * 0.5 ]
405 
406     If we want to do one iteration, multiply the result by x,
407     and multiply the result by a scale factor we get:
408 
409 	y' = scale   * x     * ( y + [ (1 - x * y^2) * y * 0.5 ] )
410 	y' = scale   * x * y * ( 1 + [ (1 - x * y^2) * 0.5 ] )
411 	y' = scale/2 * x * y * ( 2 + [ (1 - x * y^2) ] )        gives about 5/4 lsb error
412 	y' = scale/2 * x * y * ( 3 - x * y^2 )			gives about 8/4 lsb error
413 
414     So iterate to get better 1/sqrt(x) and multiply by x to get sqrt(x).
415 */
416 
417 /*
418 **  For quad precision, we need additional Newton's iterations.
419 **  For lower precisions, the iteration (if needed) is embedded
420 **  in the ITERATE_AND_MAYBE_CHECK_LAST_BIT macro.
421 */
422 #if QUAD_PRECISION
423 
424 /*
425 **  NEWTONS_ITERATION
426 **
427 **	Inputs:
428 **		scaled_x	any number
429 **			ignoring scaled_x <= 0
430 **
431 **		y		~= 1/sqrt(scaled_x)
432 **
433 **	Outputs:
434 **		y		~= 1/sqrt(scaled_x)
435 **			y becomes a better approximation
436 **
437 **	Temporaries:
438 **		a, b, c
439 */
440 #       define NEWTONS_ITERATION \
441              a = y * scaled_x; \
442              b = a * y; \
443              b = one - b; \
444              b *= y; \
445              c = y + y; \
446              c += b; \
447              y = c * half
448 
449 #else
450 
451 #       define NEWTONS_ITERATION
452 
453 #endif
454 
455 
456 
457 /*----------------------------------------------------------------------------*/
458 /*			ITERATE_AND_MAYBE_CHECK_LAST_BIT		      */
459 /*----------------------------------------------------------------------------*/
460 
461 #if 0		/* To make all arms 'elif's */
462 #elif FAST_SQRT && (F_PRECISION <= 24)
463 
464 	/* Don't do a Newton's iteration */
465 
466 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
467 		a = y * scaled_x; \
468 		b = half_scale + half_scale; \
469 		f_type_y = (F_TYPE)(a * b)
470 
471 #	define RESULT f_type_y
472 
473 #elif RSQRT && (F_PRECISION <= 24)
474 
475 	/* Don't do a Newton's iteration */
476 
477 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
478 		b = flah_scale + flah_scale; \
479 		f_type_y = (F_TYPE)(y * b)
480 
481 #	define RESULT f_type_y
482 
483 #elif SQRT && (F_PRECISION <= 24) && (B_PRECISION < 2*F_PRECISION)
484 
485 	/* This case is unlikely enough that we will worry about it
486 	when we need to (if ever).  There is code in older versions of
487 	sqrt that does a tuckermans rounding on single prec values.  */
488 
489 #	error "We need to worry about it now."
490 
491 #elif SQRT && (F_PRECISION <= 24) && (B_PRECISION >= 2*F_PRECISION)
492 
493 	/* Make sure the last bit is correctly rounded by computing
494 	a double-precision result, and then rounding it to single.  */
495 
496 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
497 		a = y * scaled_x; \
498 		b = a * y; \
499 		c = a * half_scale; \
500 		b = three - b; \
501 		f_type_y = (F_TYPE)(c * b)
502 
503 #	define RESULT f_type_y
504 
505 #elif RSQRT
506 
507 	/* Do more accurate iteration (about 1 lsb error) */
508 
509 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
510 		c = y * flah_scale; \
511 		f_type_y = (F_TYPE)((c+c)+c*(one-scaled_x*(y*y)));
512 
513 #	define RESULT f_type_y
514 
515 #elif RSQRT
516 
517 	/* Do sloppy iteration (about 2 lsb error).
518 	y = (y * flah_scale) * (three - (y*scaled_x) * y) */
519 
520 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
521 		a = y * scaled_x; \
522 		b = a * y; \
523 		c = y * flah_scale; \
524 		b = three - b; \
525 		y = c * b
526 
527 #	define RESULT y
528 
529 #elif FAST_SQRT
530 
531 	/* Do sloppy iteration (about 2 lsb error).
532 	y = ((y*scaled_x) * half_scale) * (three - (y*scaled_x) * y) */
533 
534 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
535 		a = y * scaled_x; \
536 		b = a * y; \
537 		c = a * half_scale; \
538 		b = three - b; \
539 		y = c * b
540 
541 #	define RESULT y
542 
543 #elif SQRT
544 
545 	/* Do more accurate iteration and check last bit.
546 	[ NB: we compute ulp = 2*ULP_FACTOR*c, because y ~= 2*c.] */
547 
548 #	define DECLARE_old_mode U_WORD old_mode;
549 #	define DECLARE_ulp_stuff F_TYPE ulp, y_less_1_ulp, y_plus_1_ulp;
550 #	define ITERATE_AND_MAYBE_CHECK_LAST_BIT \
551 		a = y * scaled_x; \
552 		ulp = 2.0*ULP_FACTOR; \
553 		b = a * y; \
554 		c = a * half_scale; \
555 		b = one - b; \
556 		a = c + c; \
557 		b = c * b; \
558 		ulp *= c; \
559 		y = a + b; \
560 		y_less_1_ulp = y - ulp; \
561 		ASSERT( y_less_1_ulp < y ); \
562 		y_plus_1_ulp = y + ulp; \
563 		ASSERT( y_plus_1_ulp > y ); \
564 		ESTABLISH_ROUND_TO_ZERO(old_mode); \
565 		F_MUL_CHOPPED(y, y_less_1_ulp, a); \
566 		F_MUL_CHOPPED(y, y_plus_1_ulp, b); \
567 		RESTORE_ROUNDING_MODE(old_mode); \
568 		y = ((a >= x) ? y_less_1_ulp : y); \
569 		y = ((b <  x) ? y_plus_1_ulp : y);
570 
571 #	define RESULT y
572 
573 #else
574 
575 	error "Can't define ITERATE_AND_MAYBE_CHECK_LAST_BIT"
576 
577 #endif
578 
579 
580 #ifndef DECLARE_old_mode
581 #define DECLARE_old_mode
582 #endif
583 #ifndef DECLARE_ulp_stuff
584 #define DECLARE_ulp_stuff
585 #endif
586 
587 
588 
589 /*----------------------------------------------------------------------------*/
590 /*                      The Function Itself!                                  */
591 /*----------------------------------------------------------------------------*/
592 
593 
594 
595 F_TYPE F_ENTRY_NAME(F_TYPE x)
596 {
597         EXCEPTION_RECORD_DECLARATION
598 	B_UNION u, v;
599 
600 	F_TYPE f_type_y;
601 	B_TYPE y, a, b, c;
602 	B_TYPE scaled_x;
603 	B_TYPE IF_SQRT (half_scale)
604 	       IF_RSQRT(flah_scale);
605 	const B_TYPE half  = (B_TYPE)0.5;
606 	const B_TYPE one   = (B_TYPE)1.0;
607 	const B_TYPE three = (B_TYPE)3.0;
608 	DECLARE_old_mode
609 	DECLARE_ulp_stuff
610 
611 	LS_INT_TYPE   exp, save_exp;
612 	U_LS_INT_TYPE index;
613 	U_LS_INT_TYPE lo_exp_bit_and_hi_frac;
614 	U_LS_INT_TYPE hi_exp_mask = HI_EXP_BIT_MASK;
615 	U_LS_INT_TYPE exp_of_one_half = EXP_BITS_OF_ONE_HALF;
616 
617 #if defined(HAS_SQRT_INSTRUCTION) && ( FAST_SQRT || SQRT ) && ( SINGLE_PRECISION || DOUBLE_PRECISION )
618 	u.f = (B_TYPE)x;
619 	save_exp = u.B_HI_LS_INT_TYPE;
620 
621 	if INPUT_IS_ABNORMAL
622 		goto abnormal_input;
623 
624 	F_HW_SQRT(x,RESULT);
625 
626 	return RESULT;
627 #else
628 	SCALE_AND_DO_INDEXED_POLY_APPROX;
629 
630 	if INPUT_IS_ABNORMAL
631 		goto abnormal_input;
632 
633         NEWTONS_ITERATION;
634         NEWTONS_ITERATION;
635 
636 	ITERATE_AND_MAYBE_CHECK_LAST_BIT;
637 
638 	return RESULT;
639 #endif
640 
641 
642 abnormal_input:
643 
644 #if VAX_FLOATING
645 
646 	/* x is either 0 or negative   */
647 
648 	if (x == (F_TYPE)0.0) {
649 #if RSQRT
650 		GET_EXCEPTION_RESULT_1(RSQRT_OF_POS_ZERO, x, RESULT);
651 #else
652 		RESULT = x;
653 #endif
654 	} else {
655 		GET_EXCEPTION_RESULT_1(SQRT_OF_NEGATIVE, x, RESULT);
656 	}
657 	return RESULT;
658 
659 
660 #elif (IEEE_FLOATING)
661 
662 	F_CLASSIFY(x, index);
663 
664 	switch (index) {
665 
666             case F_C_SIG_NAN:
667             case F_C_QUIET_NAN:
668                RESULT = x;
669                return RESULT;
670                break;
671 
672 #if RSQRT
673 
674             case F_C_POS_INF:
675                RESULT = (F_TYPE)0.0;
676                return RESULT;
677                break;
678             case F_C_POS_ZERO:
679                GET_EXCEPTION_RESULT_1(RSQRT_OF_POS_ZERO, x, RESULT);
680                return RESULT;
681                break;
682             case F_C_NEG_ZERO:
683                GET_EXCEPTION_RESULT_1(RSQRT_OF_NEG_ZERO, x, RESULT);
684                return RESULT;
685                break;
686 
687 #else
688 
689             case F_C_POS_INF:
690             case F_C_POS_ZERO:
691             case F_C_NEG_ZERO:
692                RESULT = x;
693                return RESULT;
694                break;
695 
696 #endif
697 
698             case F_C_NEG_INF:
699             case F_C_NEG_NORM:
700             case F_C_NEG_DENORM:
701 		GET_EXCEPTION_RESULT_1(SQRT_OF_NEGATIVE, x, RESULT);
702                 return RESULT;
703                 break;
704 
705             default:
706 
707                 /* must be positive denorm */
708 
709 		F_MAKE_FLOAT(
710                    ((WORD) (2*F_PRECISION + 1) << F_EXP_POS), f_type_y);
711 		F_COPY_SIGN_AND_EXP(x, f_type_y, x);
712 		x -= f_type_y;
713 
714 #if defined(HAS_SQRT_INSTRUCTION) && ( FAST_SQRT || SQRT ) && ( SINGLE_PRECISION || DOUBLE_PRECISION )
715 		F_HW_SQRT(x,RESULT);
716 #else
717 		SCALE_AND_DO_INDEXED_POLY_APPROX;
718 
719                 NEWTONS_ITERATION;
720                 NEWTONS_ITERATION;
721 
722 		ITERATE_AND_MAYBE_CHECK_LAST_BIT;
723 #endif
724 
725 		/* Scale down again (up for RSQRT) */
726 
727 		IF_SQRT ( SUB_FROM_EXP_FIELD(RESULT, F_PRECISION) );
728 		IF_RSQRT(   ADD_TO_EXP_FIELD(RESULT, F_PRECISION) );
729 		return RESULT;
730                 break;
731 	}
732 
733 #endif
734 
735 }  /* sqrt */
736 
737 
738 
739 /*----------------------------------------------------------------------------*/
740 /*                      MPHOC code to generate the table                      */
741 /*----------------------------------------------------------------------------*/
742 
743 
744 #if MAKE_INCLUDE
745 
746 #undef  F_NAME_SUFFIX
747 #define F_NAME_SUFFIX TABLE_SUFFIX
748 
749 @divert divertText
750 
751 
752 	/*
753 	** Print header information.
754 	*/
755 	print;
756 	print "#include \"dpml_private.h\"";
757 	print;
758 	print "#define NUM_FRAC_BITS ", STR(NUM_FRAC_BITS);
759 	print;
760 	/*
761 	** The definitions of SQRT_COEF_STRUCT and D_SQRT_TABLE_NAME also
762 	** appear in the code.
763 	*/
764 	print "typedef struct {";
765 	print "	float a, b;";
766 	print "	double c;";
767 	print "} SQRT_COEF_STRUCT;";
768 	print;
769 	print "const SQRT_COEF_STRUCT D_SQRT_TABLE_NAME[(1<<(NUM_FRAC_BITS+1))] = {";
770 	print;
771 
772 /*
773 ** Generate and print the polynomial coefficients.
774 */
rsqrt_f(r)775 function rsqrt_f(r) { return 1/sqrt(r); }
776 
777 precision = ceil( (D_PRECISION + 16)/MP_RADIX_BITS );
778 
779 /*
780 **  For each half fo the table, ...
781 */
782 for (h = 1; h <= 2; h++) {
783 
784     xaa = 0.5;
785     xbb = 1.0;
786     xkk = 1.0/h;
787     print;
788     printf("/*\n**\t");
789     printf("a*x^2 + b*x + c");
790     printf(" ~= sqrt(%5r/x),\t\t%5r <= x < %5r", xkk, xaa, xbb);
791     printf("\n*/\n");
792 
793     for (i = 0; i < 2^NUM_FRAC_BITS; i++) {
794 	xa = xaa + (xbb-xaa) *   i  /2^NUM_FRAC_BITS;
795 	xb = xaa + (xbb-xaa) * (i+1)/2^NUM_FRAC_BITS;
796 	/*
797 	** Determine a minimum-error quadratic approximation to
798 	** sqrt(xkk/x) in the range xa <= x <= xb.  (This doesn't
799 	** minimize the error after a Newton's iteration; that'd
800 	** require a weighting function of x^(1/4), a needless
801 	** complication for this single-precision approximation).
802 	*/
803         tol = S_PRECISION+2;
804         flags = 0;
805         err = remes(flags, xa, xb, rsqrt_f, tol, &degree, &rsqrt_c);
806         if (degree != 2) print("*** degree = %i\n", degree);
807 	for (j = 0; j <= degree; j++)
808 	    rsqrt_c[j] = rsqrt_c[j] * sqrt(xkk);
809 	/*
810 	** Now round the x^2 and x coefficients to single precision,
811 	** by subtracting Chebyshev polynomials.  The additional error
812 	** is negligible (less than 3%; e.g., if the polynomial was good to
813 	** 27 bits, it's degraded to only 27-log2(1.03) = 26.96 bits).
814 	**
815 	** The algebra is simplified by expressing the range xa..xb in terms of
816 	** the range's midpoint and radius.
817 	*/
818 	xm = (xb + xa)/2;
819 	xr = (xb - xa)/2;
820 	z = xm / xr;
821 	/*
822 	** The Chebyshev polynomials we subtract are multiples of:
823 	**
824 	**	w	  <->	(x-xm)/xr
825 	**	1-2*w^2	  <->	1-2*((x-xm)/xr)^2
826 	**
827 	** The x terms are collected, scaled (by t), and subtracted from the
828 	** polynomial coefficients.
829 	**
830 	** First we subtract (a multiple of) the 2nd degree Chebyshev polynomial
831 	** to produce a new polynomial with the desired (representable in single
832 	** precision) 2nd degree polynomial coefficient.  This minimizes the
833 	** maximum absolute error between the 'Remes' polynomial and the new
834 	** polynomial (since the difference is a Chebyshev polynomial, which
835 	** has the 'equal ripple' property).
836 	**
837 	** Then we subtract (a multiple of) the 1st degree Chebyshev polynomial
838 	** to produce a new polynomial with the desired (representable in single
839 	** precision) 1st degree coefficient.  This minimizes the maximum
840 	** absolute error between the previous polynomial and the newer one
841 	** (under the constraints of having the same 2nd degree coefficient,
842 	** and the desired 1st degree coefficient).  The 0th degree coefficient
843 	** is rounded to double precision (somebody's got to!), and this has
844 	** no significant effect on the single precision result.
845 	**
846 	** Is the resulting polynomial optimal?  Nope; nobody claims it is.
847 	** Is it 'best' in some sense?  Yes -- the theory is clear and the code
848 	** is short (disregarding this phillipic).  Is it close enough?  Yep.
849 	** Why?  That's a good question....
850 	**
851 	** To see why this works, consider the polynomial for 1/sqrt(x) for
852 	** 1 <= x < 1+2^-7,
853 	**
854 	**	0.37... x^2 + -1.24... x + 1.87...
855 	**
856 	** Simply rounding the x coefficient to 24 bits may corrupt the result
857 	** of the polynomial by as much as (1+2^-7) * 0.5*s_lsb(1.24), where
858 	** s_lsb(z) = 2^floor(log2(|z|) + 1 - 24) is the value of z's least
859 	** significant bit when z is expressed in single precision.  This is
860 	** as much as 2^-24, which is 2*s_lsb(1/sqrt(x)) -- two single-precision
861 	** lsb of the result!  Rounding the x^2 coefficient has similar effects,
862 	** affecting the result by 1/2 single-precision lsb.  We can do better.
863 	**
864 	** If rounding increases the x coefficient by t, |t| <= 0.5*lsb(1.24),
865 	** the corruption can be partly compensated by adjusting the constant
866 	** coefficient, decreasing it by (for example) t*(1 + 1+2^-7)/2.
867 	** The corruption is then:
868 	**
869 	**	t*( x - (1+1+2^-7)/2 )
870 	**
871 	** Since 1 <= x < 1+2^-7, and |t| <= 0.5*lsb(1.24), we have:
872 	**
873 	**	| t*( x - (1+1+2^-7)/2 ) | <= 0.5*lsb(1.24) * 2^-8 = 2^(-24 -8)
874 	**
875 	** which is only 0.0078125*s_lsb(1/sqrt(x)) -- a factor of 256 smaller
876 	** than the corruption from simply rounding the x coefficient.
877 	**
878 	** To minimize the (absolute value of the) maximum corruption, we add
879 	** a multiple of a Chebyshev polynomial, for the particular range of x,
880 	** because Chebyshev polynomials are 'minimax' (or 'equal ripple')
881 	** polynomials.
882 	** For the range -1 <= w <= 1, the Chebyshev polynomials are:
883 	**
884 	**	1,  w,  2*w^2-1,  4*w^3-3*w,  ....
885 	**
886 	** To convert these to polynomials in x for the range a <= x <= b,
887 	** substitute (x-m)/r, with m = (b+a)/2, r = (b-a)/2, and z = m/r.
888 	** The Chebyshev polynomials become:
889 	**
890 	**	1, x/r - z, 2*(x/r)^2 - 4*z*(x/r) + 2*z^2-1,
891 	**	4*(x/r)^3 - 12*z*(x/r)^2 + (12*z^2-3)*(x/r) - 4*z^3+3*z, ....
892 	**
893 	** For 1 <= x < 1+2^-7, these are:
894 	**
895 	**	1, 2^8*x - (2^8+1), 2^17*x^2 - (2^18+2^10)*x + (2^17+2^10+1),
896 	**	2^26*x^3 - 3*(2^26+2^18)*x^2 + 3*(2^26+2^19+3*2^8)*x
897 	**			- (2^26+3*2^18+2^11+2^8+1), ....
898 	**
899 	** Each of these are 'equal ripple', oscillating between +/-1.  We see
900 	** our previous adjustment, ( x - (1+1+2^-7)/2 ), appear here with a
901 	** factor of 2^8.  Scaling it by t*2^-8 gives our previous result; this
902 	** scaling also reduces the 'ripple' to +/-t*2^-8.
903 	**
904 	** When we use the 2nd degree Chebyshev polynomial to round the 2nd
905 	** degree coefficient to single precision, we must scale the polynomial
906 	** by a factor of t*2^-17, where here |t| <= 0.5*lsb(0.37).  This means
907 	** that the effect of this corruption, the size of the 'ripple', is less
908 	** than 0.5*lsb(0.37)*2^-17 = 2^-43, or 2^-18*s_lsb(1/sqrt(x)).  This is
909 	** far better than the the 1/2 lsb we got when we simply rounded the x^2
910 	** coefficient.
911 	**
912 	** Can this technique be applied to other polynomial coefficients?
913 	** It is an invention of my own conception developed outside the term
914 	** of my contract, and for which I've received no compensation.
915 	*/
916 	t = rsqrt_c[2] - bround(rsqrt_c[2], S_PRECISION);
917 	rsqrt_c[2] = rsqrt_c[2] - t;
918 	rsqrt_c[1] = rsqrt_c[1] + t * 2*z * xr;
919 	rsqrt_c[0] = rsqrt_c[0] + t * (0.5-z^2) * xr^2;
920 	t = rsqrt_c[1] - bround(rsqrt_c[1], S_PRECISION);
921 	rsqrt_c[2] = rsqrt_c[2];
922 	rsqrt_c[1] = rsqrt_c[1] - t;
923 	rsqrt_c[0] = rsqrt_c[0] + t * z * xr;
924 	t = rsqrt_c[0] - bround(rsqrt_c[1], D_PRECISION);
925         printf("{\t%.10r,\t%.10r,\t%.20r\t},\n",
926 	    rsqrt_c[2], rsqrt_c[1], rsqrt_c[0]);
927     }
928 }
929 
930 	/*
931 	** Print the trailer.
932 	*/
933 	print;
934 	print "};";
935 	print;
936 
937 
938 @end_divert
939 @eval my $outText = MphocEval( GetStream( "divertText" ) );		\
940      my $headerText = GetHeaderText( STR(BUILD_FILE_NAME),		\
941                        "Double precision square root table", __FILE__);	\
942      print "$headerText\n\n$outText";
943 
944 
945 #endif  /* MAKE_INCLUDE */
946 
947 
948 /*----------------------------------------------------------------------------*/
949 /*                              Testing                                       */
950 /*----------------------------------------------------------------------------*/
951 
952 #if MAKE_MTC
953 
954 
955 @divert > dpml_sqrt.mtc
956 
957 
958 build default = "sqrt.a";
959 
960 function SINGLE_SQRT      = F_CHAR F_SQRT_NAME(F_CHAR.v.r);
961 function FAST_SINGLE_SQRT = F_CHAR F_FAST_SQRT_NAME(F_CHAR.v.r);
962 function DOUBLE_SQRT      = B_CHAR B_SQRT_NAME(B_CHAR.v.r);
963 function FAST_DOUBLE_SQRT = B_CHAR B_FAST_SQRT_NAME(B_CHAR.v.r);
964 function MP_SQRT          = void mp_sqrt(m.r.r, m.r.w);
965 
966 
967 
968 type SQRT_ACCURACY = accuracy
969 	error = lsb;
970 	stats = max;
971 	points = 1024;
972 ;
973 
974 
975 domain SINGLE_SQRT_DENORMS  = { [ 0.0 , 1e-37  ]:uniform:10001 } ;
976 domain DOUBLE_SQRT_DENORMS  = { [ 0.0 , 1e-307 ]:uniform:10001 } ;
977 domain SINGLE_SQRT_ACCURACY = { [ 0.0 , 17.0   ]:uniform:100001 } ;
978 domain DOUBLE_SQRT_ACCURACY = { [ 0.0 , 17.0   ]:uniform:100001 } ;
979 
980 domain SQRT_KEYPOINTS =
981 	lsb = 0.5; { 2.0 | der } { 5.0 | der } { 10.0 | der }
982 	lsb = 0.5; { MTC_POS_TINY | der } { MTC_POS_HUGE | der }
983 	{ 0.0 | 0.0 } { 1.0 | 1.0 } { MTC_NEG_ZERO | MTC_NEG_ZERO }
984 	{ MTC_POS_INFINITY | MTC_POS_INFINITY } { MTC_NAN | MTC_NAN }
985 ;
986 
987 domain FAST_SINGLE_SQRT_KEYPOINTS =
988 	lsb = 1.0; { 2.0 | der } { 5.0 | der } { 10.0 | der }
989 	lsb = 1.0; { MTC_POS_TINY | der } { MTC_POS_HUGE | der }
990 	{ 0.0 | 0.0 } { 1.0 | 1.0 } { MTC_NEG_ZERO | MTC_NEG_ZERO }
991 	{ MTC_POS_INFINITY | MTC_POS_INFINITY } { MTC_NAN | MTC_NAN }
992 ;
993 
994 domain FAST_DOUBLE_SQRT_KEYPOINTS =
995 	lsb = 2.0; { 2.0 | der } { 5.0 | der } { 10.0 | der }
996 	lsb = 2.0; { MTC_POS_TINY | der } { MTC_POS_HUGE | der }
997 	{ 0.0 | 0.0 } { 1.0 | 1.0 } { MTC_NEG_ZERO | MTC_NEG_ZERO }
998 	{ MTC_POS_INFINITY | MTC_POS_INFINITY } { MTC_NAN | MTC_NAN }
999 ;
1000 
1001 
1002 test sqrt_acc_sd =
1003 	type   = SQRT_ACCURACY;
1004 	domain = SINGLE_SQRT_ACCURACY;
1005 	function            = SINGLE_SQRT;
1006 	comparison_function = FAST_DOUBLE_SQRT;
1007 	output =
1008 		file = "sqrt_acc_sd.out";
1009 	;
1010 ;
1011 
1012 test sqrt_denorm_acc_sd =
1013 	type   = SQRT_ACCURACY;
1014 	domain = SINGLE_SQRT_DENORMS;
1015 	function            = SINGLE_SQRT;
1016 	comparison_function = FAST_DOUBLE_SQRT;
1017 	output =
1018 		file = "sqrt_denorm_acc_sd.out";
1019 	;
1020 ;
1021 
1022 test fast_sqrt_acc_sd =
1023 	type   = SQRT_ACCURACY;
1024 	domain = SINGLE_SQRT_ACCURACY;
1025 	function            = FAST_SINGLE_SQRT;
1026 	comparison_function = FAST_DOUBLE_SQRT;
1027 	output =
1028 		file = "fast_sqrt_acc_sd.out";
1029 	;
1030 ;
1031 
1032 
1033 test sqrt_acc_dm =
1034 	type   = SQRT_ACCURACY;
1035 	domain = DOUBLE_SQRT_ACCURACY;
1036  	function            = DOUBLE_SQRT;
1037 	comparison_function = MP_SQRT;
1038 	output =
1039 		file = "sqrt_acc_dm.out";
1040 	;
1041 ;
1042 
1043 test sqrt_denorm_acc_dm =
1044 	type   = SQRT_ACCURACY;
1045 	domain = DOUBLE_SQRT_DENORMS;
1046  	function            = DOUBLE_SQRT;
1047 	comparison_function = MP_SQRT;
1048 	output =
1049 		file = "sqrt_denorm_acc_dm.out";
1050 	;
1051 ;
1052 
1053 test fast_sqrt_acc_dm =
1054 	type   = SQRT_ACCURACY;
1055 	domain = DOUBLE_SQRT_ACCURACY;
1056  	function            = FAST_DOUBLE_SQRT;
1057 	comparison_function = MP_SQRT;
1058 	output =
1059 		file = "fast_sqrt_acc_dm.out";
1060 	;
1061 ;
1062 
1063 
1064 test sqrt_key_sd =
1065     type   = key_point;
1066     domain = SQRT_KEYPOINTS;
1067     function            = SINGLE_SQRT;
1068     comparison_function = DOUBLE_SQRT;
1069     output =
1070         file = "sqrt_key_sd.out"  ;
1071         style = verbose;
1072     ;
1073 ;
1074 
1075 test sqrt_key_dm =
1076     type   = key_point;
1077     domain = SQRT_KEYPOINTS;
1078     function            = DOUBLE_SQRT;
1079     comparison_function = MP_SQRT;
1080     output =
1081         file = "sqrt_key_dm.out"  ;
1082         style = verbose;
1083     ;
1084 ;
1085 
1086 test fast_sqrt_key_sd =
1087     type   = key_point;
1088     domain = FAST_SINGLE_SQRT_KEYPOINTS;
1089     function            = FAST_SINGLE_SQRT;
1090     comparison_function = FAST_DOUBLE_SQRT;
1091     output =
1092         file = "fast_sqrt_key_sd.out"  ;
1093         style = verbose;
1094     ;
1095 ;
1096 
1097 test fast_sqrt_key_dm =
1098     type   = key_point;
1099     domain = FAST_DOUBLE_SQRT_KEYPOINTS;
1100     function            = FAST_DOUBLE_SQRT;
1101     comparison_function = MP_SQRT;
1102     output =
1103         file = "fast_sqrt_key_dm.out"  ;
1104         style = verbose;
1105     ;
1106 ;
1107 
1108 
1109 @end_divert
1110 
1111 
1112 #endif  /* MAKE_MTC */
1113 
1114 
1115