1 /******************************************************************************
2   Copyright (c) 2007-2011, Intel Corp.
3   All rights reserved.
4 
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions are met:
7 
8     * Redistributions of source code must retain the above copyright notice,
9       this list of conditions and the following disclaimer.
10     * Redistributions in binary form must reproduce the above copyright
11       notice, this list of conditions and the following disclaimer in the
12       documentation and/or other materials provided with the distribution.
13     * Neither the name of Intel Corporation nor the names of its contributors
14       may be used to endorse or promote products derived from this software
15       without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27   THE POSSIBILITY OF SUCH DAMAGE.
28 ******************************************************************************/
29 
30 #ifndef SQRT_MACROS_H
31 #define SQRT_MACROS_H
32 
33 
34 /* None of these macros screen for abnormal inputs.
35 They all assume positive finite values.  */
36 
37 
38 #define NUM_FRAC_BITS       7
39 #define INDEX_MASK        MAKE_MASK((NUM_FRAC_BITS + 1), 0)
40 
41 #if (IEEE_FLOATING)
42 
43 #       define IF_IEEE_FLOATING(x) x
44 
45 #       define LOC_OF_EXPON ((BITS_PER_LS_INT_TYPE - 1) - B_EXP_WIDTH)
46 #       define EXP_BITS_OF_ONE_HALF  ((U_LS_INT_TYPE)(B_EXP_BIAS-B_NORM-1) << LOC_OF_EXPON)
47 #       define EXPON_MASK   MAKE_MASK(B_EXP_WIDTH, 0)
48 #	define HI_EXP_BIT_MASK   ((EXPON_MASK - 1) << LOC_OF_EXPON)
49 #	define GET_SQRT_TABLE_INDEX(exp,index) \
50 		index = (exp >> (LOC_OF_EXPON - NUM_FRAC_BITS)); \
51 		index &= INDEX_MASK
52 
53 #   if ((ARCHITECTURE == alpha) && defined(HAS_LOAD_WRONG_STORE_SIZE_PENALTY))
54 #       define V_UNION_64_BIT_STORE \
55                 v.B_UNSIGNED_HI_64 = ((U_WORD)exp) >> 1
56 #       define V_UNION_128_BIT_STORE \
57                 v.B_UNSIGNED_HI_64 = ((U_WORD)exp) >> 1; \
58                 v.B_UNSIGNED_LO_64 = 0
59 #   else
60 #	define V_UNION_64_BIT_STORE \
61 		v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) << 31
62 #	define V_UNION_128_BIT_STORE \
63 		v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) << 31; \
64                 v.B_UNSIGNED_LO_64 = 0
65 #   endif
66 
67 #else
68 
69 #       define IF_IEEE_FLOATING(x)
70 
71 #	define EXP_BITS_OF_ONE_HALF 0x4000
72 #	define HI_EXP_BIT_MASK 0x7fe0
73 #	define GET_SQRT_TABLE_INDEX(exp,index) \
74 		index = ((exp << 3) | ((U_INT_32)exp >> 29)); \
75 		index &= INDEX_MASK
76 #	define V_UNION_64_BIT_STORE \
77 		v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) >> 1
78 #	define V_UNION_128_BIT_STORE \
79 		v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) >> 1 ;\
80                 v.B_UNSIGNED_LO_64 = 0
81 
82 #endif
83 
84 #if (ARCHITECTURE == alpha) || (BITS_PER_WORD == 64)
85 #     if QUAD_PRECISION
86 #	   define STORE_EXP_TO_V_UNION \
87 		V_UNION_128_BIT_STORE
88 #     else
89 #	   define STORE_EXP_TO_V_UNION \
90 		V_UNION_64_BIT_STORE
91 #     endif
92 #else
93 #     if QUAD_PRECISION
94 #	   define STORE_EXP_TO_V_UNION \
95 		v.B_SIGNED_HI_32 = ((U_INT_32)exp) >> 1; \
96 		v.B_SIGNED_LO1_32 = 0;\
97 		v.B_SIGNED_LO2_32 = 0;\
98 		v.B_SIGNED_LO3_32 = 0
99 #     else
100 #	   define STORE_EXP_TO_V_UNION \
101 		v.B_SIGNED_HI_32 = ((U_INT_32)exp) >> 1; \
102 		v.B_SIGNED_LO_32 = 0
103 #     endif
104 #endif
105 
106 #if QUAD_PRECISION
107 #       define NEWTONS_ITERATION \
108              a = y * scaled_x; \
109              b = a * y; \
110              b = one - b; \
111              b *= y; \
112              c = y + y; \
113              c += b; \
114              y = c * half
115 
116 #else
117 #       define NEWTONS_ITERATION
118 #endif
119 
120 #if QUAD_PRECISION
121 #       define NEWTONS_ITERATION_NO_SCALE(input) \
122              a = y * (B_TYPE)(input); \
123              b = a * y; \
124              b = one - b; \
125              b *= y; \
126              c = y + y; \
127              c += b; \
128              y = c * half
129 
130 #else
131 #       define NEWTONS_ITERATION_NO_SCALE(input)
132 #endif
133 
134 
135 
136 
137 typedef struct { float a, b; double c; } SQRT_COEF_STRUCT;
138 extern const SQRT_COEF_STRUCT D_SQRT_TABLE_NAME[];
139 
140 
141 #define SQRT_FIRST_PART(input) \
142 	B_UNION u, v; \
143 	B_TYPE y, a, b, c; \
144 	B_TYPE scaled_x, half_scale; \
145 	B_TYPE half  = (B_TYPE)0.5; \
146 	B_TYPE one   = (B_TYPE)1.0; \
147 	B_TYPE three = (B_TYPE)3.0; \
148 	F_TYPE ulp, y_less_1_ulp, y_plus_1_ulp; \
149 	F_TYPE f_type_y, truncated_y, truncated_product; \
150         LS_INT_TYPE exp; \
151         U_LS_INT_TYPE orig_rounding_mode; \
152         U_LS_INT_TYPE index; \
153         U_LS_INT_TYPE lo_exp_bit_and_hi_frac; \
154         U_LS_INT_TYPE hi_exp_mask = HI_EXP_BIT_MASK; \
155         U_LS_INT_TYPE exp_of_one_half = EXP_BITS_OF_ONE_HALF; \
156 	u.f = (B_TYPE)(input); \
157 	exp = u.B_HI_LS_INT_TYPE; \
158 	B_COPY_SIGN_AND_EXP((B_TYPE)(input), half, y); \
159 	GET_SQRT_TABLE_INDEX(exp,index); \
160 	b = (B_TYPE)D_SQRT_TABLE_NAME[index].b; \
161 	b *= y;	 \
162 	c = (B_TYPE)D_SQRT_TABLE_NAME[index].c; \
163 	lo_exp_bit_and_hi_frac = exp & ~hi_exp_mask; \
164 	u.B_HI_LS_INT_TYPE = exp_of_one_half | lo_exp_bit_and_hi_frac; \
165 	c += b; \
166 	scaled_x = u.f; \
167 	y *= y; \
168 	a = (B_TYPE)D_SQRT_TABLE_NAME[index].a; \
169 	exp ^= lo_exp_bit_and_hi_frac; \
170 	exp += exp_of_one_half; \
171 	y *= a; \
172 	STORE_EXP_TO_V_UNION; \
173 	y += c; \
174 	half_scale = v.f
175 
176 
177 #define B_HALF_PREC_SQRT(input, result) { \
178 	SQRT_FIRST_PART(input); \
179 	a = scaled_x * y; \
180 	b = half_scale + half_scale; \
181 	y = a * b; \
182 	(result) = (B_TYPE)y; \
183 }
184 
185 #if (DYNAMIC_ROUNDING_MODES && !FAST_SQRT)
186 
187 #	define ESTABLISH_KNOWN_ROUNDING_MODE(old_mode) INIT_FPU_STATE_AND_ROUND_TO_ZERO(old_mode)
188 #	define RESTORE_ORIGINAL_ROUNDING_MODE(old_mode) RESTORE_FPU_STATE(old_mode)
189 
190 #else
191 
192 #	define ESTABLISH_KNOWN_ROUNDING_MODE(old_mode)
193 #	define RESTORE_ORIGINAL_ROUNDING_MODE(old_mode)
194 
195 #endif
196 
197 
198 #if !defined(F_MUL_CHOPPED)
199 #	define F_MUL_CHOPPED(x,y,z) (z) = (x) * (y)
200 #endif
201 
202 
203 #if ( (F_PRECISION == 24) && PRECISION_BACKUP_AVAILABLE )
204 
205 
206 	/* Make sure the last bit is correctly rounded by computing
207 	a double-precision result, and then rounding it to single.  */
208 
209 
210 #		define ITERATE_AND_MAYBE_CHECK_LAST_BIT(input) \
211 			a = y * scaled_x; \
212 			b = a * y; \
213 			c = a * half_scale; \
214 			b = three - b; \
215 			f_type_y = (F_TYPE)(c * b)
216 
217 #		define RESULT f_type_y
218 
219 
220 #else
221 
222 #   undef  ULP_FACTOR
223 #   if (F_PRECISION == 53)
224 
225 #	define ULP_FACTOR (F_TYPE)2.775557561562891351e-16 /* 1.25 * 2^(1 - F_PRECISION) */
226 
227 #   elif QUAD_PRECISION
228 
229 #       define ULP_FACTOR 1.9259299443872358530559779425849273185381e-34
230 
231 #   else
232 # 	error Unsupported F_PRECISION.
233 #   endif
234 
235 
236 
237 /* Newton's iteration for 1 / (nth root of x) is:
238 
239 	y' = y + [ (1 - x * y^n) * y / n ]
240 
241 So, the iteration for 1 / sqrt(x) is:
242 
243 	y' = y + [ (1 - x * y^2) * y * 0.5 ]
244 
245 If we want to do one iteration and multiply the result by x
246 and multiply the result by a scale factor we get:
247 
248 	y' = scale   * x     * ( y + [ (1 - x * y^2) * y * 0.5 ] )
249 	y' = scale   * x * y * ( 1 + [ (1 - x * y^2) * 0.5 ] )
250 	y' = scale/2 * x * y * ( 2 + [ (1 - x * y^2) ] )        gives about 5/4 lsb error
251 	y' = scale/2 * x * y * ( 3 - x * y^2 )					gives about 8/4 lsb error
252 
253 So iterate to get better 1/sqrt(x) and multiply by x to get sqrt(x).  */
254 
255 #		define ITERATE_AND_MAYBE_CHECK_LAST_BIT(input) \
256 			a = y * scaled_x; \
257 			ulp = ULP_FACTOR; \
258 			b = a * y; \
259 			c = a * half_scale; \
260 			b = one - b; \
261 			a = c + c; \
262 			b = c * b; \
263 			ulp *= c; \
264 			y = a + b; \
265 			y_less_1_ulp = y - ulp; \
266 			ASSERT( y_less_1_ulp < y ); \
267 			y_plus_1_ulp = y + ulp; \
268 			ASSERT( y_plus_1_ulp > y ); \
269 			ESTABLISH_KNOWN_ROUNDING_MODE(orig_rounding_mode); \
270 			F_MUL_CHOPPED(y, y_less_1_ulp, a); \
271 			F_MUL_CHOPPED(y, y_plus_1_ulp, b); \
272 			RESTORE_ORIGINAL_ROUNDING_MODE(orig_rounding_mode); \
273 			y = ((a >= input) ? y_less_1_ulp : y); \
274 			y = ((b <  input) ? y_plus_1_ulp : y); \
275 
276 #		define RESULT y
277 
278 #endif
279 
280 
281 #if (SINGLE_PRECISION)
282 
283 #	define F_SQRT(input, result) { \
284 		SQRT_FIRST_PART(input); \
285 		a = scaled_x * y; \
286 		b = half_scale + half_scale; \
287 		y = a * b; \
288 		(result) = (F_TYPE)y; \
289 	}
290 
291 #	define F_PRECISE_SQRT(input, result) { \
292 		SQRT_FIRST_PART(input); \
293 		NEWTONS_ITERATION; \
294 		NEWTONS_ITERATION; \
295 		ITERATE_AND_MAYBE_CHECK_LAST_BIT(input); \
296 		(result) = (F_TYPE) RESULT; \
297 	}
298 
299 #	define F_SQRT_2_LSB(input,result) F_SQRT(input,result)
300 
301 #	define F_SQRT_2_LSB_NO_SCALE_FINISH_ITERATION(input) \
302 		y *= (B_TYPE)(input) \
303 		y += y;
304 
305 #else
306 
307 #	define F_SQRT(input, result) { \
308 		SQRT_FIRST_PART(input); \
309                 NEWTONS_ITERATION;\
310                 NEWTONS_ITERATION;\
311 		a = scaled_x * y; \
312 		b = a * y; \
313 		c = a * half_scale; \
314 		b = one - b; \
315 		a = c + c; \
316 		b = c * b; \
317 		y = a + b; \
318 		(result) = (F_TYPE)y; \
319 	}
320 
321 #	define F_PRECISE_SQRT(input, result) { \
322 		SQRT_FIRST_PART(input); \
323                 NEWTONS_ITERATION;\
324                 NEWTONS_ITERATION;\
325 		ITERATE_AND_MAYBE_CHECK_LAST_BIT(input); \
326 		(result) = (F_TYPE) RESULT; \
327 	}
328 
329 #	define F_SQRT_2_LSB(input, result) { \
330 		SQRT_FIRST_PART(input); \
331                 NEWTONS_ITERATION; \
332                 NEWTONS_ITERATION;\
333 		a = scaled_x * y; \
334 		b = a * y; \
335 		c = a * half_scale; \
336 		b = three - b; \
337 		y = c * b; \
338 		(result) = (F_TYPE)y; \
339 	}
340 
341 #	define F_SQRT_2_LSB_NO_SCALE_FINISH_ITERATION(input) \
342                 NEWTONS_ITERATION_NO_SCALE(input);\
343                 NEWTONS_ITERATION_NO_SCALE(input);\
344 		a = (B_TYPE)(input) * y; \
345 		b = a * y; \
346 		b = three - b; \
347 		y = a * b
348 
349 #endif
350 
351 
352 /* The F_SQRT_2_LSB_NO_SCALE macro avoids most scaling (i.e. 0.5 <= input < 2.0).
353 The input for the polynomial is still scaled, however, because the
354 coefficients have a scale factor built into them.  */
355 
356 #define F_SQRT_2_LSB_NO_SCALE_TIMES_2(input, result) { \
357 	B_UNION u; \
358 	B_TYPE y, a, b, c; \
359 	B_TYPE half  = (B_TYPE)0.5; \
360 	B_TYPE one   = (B_TYPE)1.0; \
361 	B_TYPE three = (B_TYPE)3.0; \
362 	LS_INT_TYPE exp;  \
363 	U_LS_INT_TYPE index; \
364 	u.f = (B_TYPE)(input); \
365 	exp = u.B_HI_LS_INT_TYPE; \
366 	B_COPY_SIGN_AND_EXP((B_TYPE)(input), half, y); \
367 	GET_SQRT_TABLE_INDEX(exp,index); \
368 	b = (B_TYPE)D_SQRT_TABLE_NAME[index].b; \
369 	b *= y;	 \
370 	c = (B_TYPE)D_SQRT_TABLE_NAME[index].c; \
371 	c += b; \
372 	y *= y; \
373 	a = (B_TYPE)D_SQRT_TABLE_NAME[index].a; \
374 	y *= a; \
375 	y += c; \
376 	F_SQRT_2_LSB_NO_SCALE_FINISH_ITERATION(input); \
377 	(result) = (F_TYPE)y; \
378 }
379 
380 #endif  /* SQRT_MACROS_H */
381 
382 
383