1 /****************************************************************************** 2 Copyright (c) 2007-2011, Intel Corp. 3 All rights reserved. 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 this list of conditions and the following disclaimer. 10 * Redistributions in binary form must reproduce the above copyright 11 notice, this list of conditions and the following disclaimer in the 12 documentation and/or other materials provided with the distribution. 13 * Neither the name of Intel Corporation nor the names of its contributors 14 may be used to endorse or promote products derived from this software 15 without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 27 THE POSSIBILITY OF SUCH DAMAGE. 28 ******************************************************************************/ 29 30 #ifndef SQRT_MACROS_H 31 #define SQRT_MACROS_H 32 33 34 /* None of these macros screen for abnormal inputs. 35 They all assume positive finite values. */ 36 37 38 #define NUM_FRAC_BITS 7 39 #define INDEX_MASK MAKE_MASK((NUM_FRAC_BITS + 1), 0) 40 41 #if (IEEE_FLOATING) 42 43 # define IF_IEEE_FLOATING(x) x 44 45 # define LOC_OF_EXPON ((BITS_PER_LS_INT_TYPE - 1) - B_EXP_WIDTH) 46 # define EXP_BITS_OF_ONE_HALF ((U_LS_INT_TYPE)(B_EXP_BIAS-B_NORM-1) << LOC_OF_EXPON) 47 # define EXPON_MASK MAKE_MASK(B_EXP_WIDTH, 0) 48 # define HI_EXP_BIT_MASK ((EXPON_MASK - 1) << LOC_OF_EXPON) 49 # define GET_SQRT_TABLE_INDEX(exp,index) \ 50 index = (exp >> (LOC_OF_EXPON - NUM_FRAC_BITS)); \ 51 index &= INDEX_MASK 52 53 # if ((ARCHITECTURE == alpha) && defined(HAS_LOAD_WRONG_STORE_SIZE_PENALTY)) 54 # define V_UNION_64_BIT_STORE \ 55 v.B_UNSIGNED_HI_64 = ((U_WORD)exp) >> 1 56 # define V_UNION_128_BIT_STORE \ 57 v.B_UNSIGNED_HI_64 = ((U_WORD)exp) >> 1; \ 58 v.B_UNSIGNED_LO_64 = 0 59 # else 60 # define V_UNION_64_BIT_STORE \ 61 v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) << 31 62 # define V_UNION_128_BIT_STORE \ 63 v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) << 31; \ 64 v.B_UNSIGNED_LO_64 = 0 65 # endif 66 67 #else 68 69 # define IF_IEEE_FLOATING(x) 70 71 # define EXP_BITS_OF_ONE_HALF 0x4000 72 # define HI_EXP_BIT_MASK 0x7fe0 73 # define GET_SQRT_TABLE_INDEX(exp,index) \ 74 index = ((exp << 3) | ((U_INT_32)exp >> 29)); \ 75 index &= INDEX_MASK 76 # define V_UNION_64_BIT_STORE \ 77 v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) >> 1 78 # define V_UNION_128_BIT_STORE \ 79 v.B_UNSIGNED_HI_64 = ((U_INT_64)(U_INT_32)exp) >> 1 ;\ 80 v.B_UNSIGNED_LO_64 = 0 81 82 #endif 83 84 #if (ARCHITECTURE == alpha) || (BITS_PER_WORD == 64) 85 # if QUAD_PRECISION 86 # define STORE_EXP_TO_V_UNION \ 87 V_UNION_128_BIT_STORE 88 # else 89 # define STORE_EXP_TO_V_UNION \ 90 V_UNION_64_BIT_STORE 91 # endif 92 #else 93 # if QUAD_PRECISION 94 # define STORE_EXP_TO_V_UNION \ 95 v.B_SIGNED_HI_32 = ((U_INT_32)exp) >> 1; \ 96 v.B_SIGNED_LO1_32 = 0;\ 97 v.B_SIGNED_LO2_32 = 0;\ 98 v.B_SIGNED_LO3_32 = 0 99 # else 100 # define STORE_EXP_TO_V_UNION \ 101 v.B_SIGNED_HI_32 = ((U_INT_32)exp) >> 1; \ 102 v.B_SIGNED_LO_32 = 0 103 # endif 104 #endif 105 106 #if QUAD_PRECISION 107 # define NEWTONS_ITERATION \ 108 a = y * scaled_x; \ 109 b = a * y; \ 110 b = one - b; \ 111 b *= y; \ 112 c = y + y; \ 113 c += b; \ 114 y = c * half 115 116 #else 117 # define NEWTONS_ITERATION 118 #endif 119 120 #if QUAD_PRECISION 121 # define NEWTONS_ITERATION_NO_SCALE(input) \ 122 a = y * (B_TYPE)(input); \ 123 b = a * y; \ 124 b = one - b; \ 125 b *= y; \ 126 c = y + y; \ 127 c += b; \ 128 y = c * half 129 130 #else 131 # define NEWTONS_ITERATION_NO_SCALE(input) 132 #endif 133 134 135 136 137 typedef struct { float a, b; double c; } SQRT_COEF_STRUCT; 138 extern const SQRT_COEF_STRUCT D_SQRT_TABLE_NAME[]; 139 140 141 #define SQRT_FIRST_PART(input) \ 142 B_UNION u, v; \ 143 B_TYPE y, a, b, c; \ 144 B_TYPE scaled_x, half_scale; \ 145 B_TYPE half = (B_TYPE)0.5; \ 146 B_TYPE one = (B_TYPE)1.0; \ 147 B_TYPE three = (B_TYPE)3.0; \ 148 F_TYPE ulp, y_less_1_ulp, y_plus_1_ulp; \ 149 F_TYPE f_type_y, truncated_y, truncated_product; \ 150 LS_INT_TYPE exp; \ 151 U_LS_INT_TYPE orig_rounding_mode; \ 152 U_LS_INT_TYPE index; \ 153 U_LS_INT_TYPE lo_exp_bit_and_hi_frac; \ 154 U_LS_INT_TYPE hi_exp_mask = HI_EXP_BIT_MASK; \ 155 U_LS_INT_TYPE exp_of_one_half = EXP_BITS_OF_ONE_HALF; \ 156 u.f = (B_TYPE)(input); \ 157 exp = u.B_HI_LS_INT_TYPE; \ 158 B_COPY_SIGN_AND_EXP((B_TYPE)(input), half, y); \ 159 GET_SQRT_TABLE_INDEX(exp,index); \ 160 b = (B_TYPE)D_SQRT_TABLE_NAME[index].b; \ 161 b *= y; \ 162 c = (B_TYPE)D_SQRT_TABLE_NAME[index].c; \ 163 lo_exp_bit_and_hi_frac = exp & ~hi_exp_mask; \ 164 u.B_HI_LS_INT_TYPE = exp_of_one_half | lo_exp_bit_and_hi_frac; \ 165 c += b; \ 166 scaled_x = u.f; \ 167 y *= y; \ 168 a = (B_TYPE)D_SQRT_TABLE_NAME[index].a; \ 169 exp ^= lo_exp_bit_and_hi_frac; \ 170 exp += exp_of_one_half; \ 171 y *= a; \ 172 STORE_EXP_TO_V_UNION; \ 173 y += c; \ 174 half_scale = v.f 175 176 177 #define B_HALF_PREC_SQRT(input, result) { \ 178 SQRT_FIRST_PART(input); \ 179 a = scaled_x * y; \ 180 b = half_scale + half_scale; \ 181 y = a * b; \ 182 (result) = (B_TYPE)y; \ 183 } 184 185 #if (DYNAMIC_ROUNDING_MODES && !FAST_SQRT) 186 187 # define ESTABLISH_KNOWN_ROUNDING_MODE(old_mode) INIT_FPU_STATE_AND_ROUND_TO_ZERO(old_mode) 188 # define RESTORE_ORIGINAL_ROUNDING_MODE(old_mode) RESTORE_FPU_STATE(old_mode) 189 190 #else 191 192 # define ESTABLISH_KNOWN_ROUNDING_MODE(old_mode) 193 # define RESTORE_ORIGINAL_ROUNDING_MODE(old_mode) 194 195 #endif 196 197 198 #if !defined(F_MUL_CHOPPED) 199 # define F_MUL_CHOPPED(x,y,z) (z) = (x) * (y) 200 #endif 201 202 203 #if ( (F_PRECISION == 24) && PRECISION_BACKUP_AVAILABLE ) 204 205 206 /* Make sure the last bit is correctly rounded by computing 207 a double-precision result, and then rounding it to single. */ 208 209 210 # define ITERATE_AND_MAYBE_CHECK_LAST_BIT(input) \ 211 a = y * scaled_x; \ 212 b = a * y; \ 213 c = a * half_scale; \ 214 b = three - b; \ 215 f_type_y = (F_TYPE)(c * b) 216 217 # define RESULT f_type_y 218 219 220 #else 221 222 # undef ULP_FACTOR 223 # if (F_PRECISION == 53) 224 225 # define ULP_FACTOR (F_TYPE)2.775557561562891351e-16 /* 1.25 * 2^(1 - F_PRECISION) */ 226 227 # elif QUAD_PRECISION 228 229 # define ULP_FACTOR 1.9259299443872358530559779425849273185381e-34 230 231 # else 232 # error Unsupported F_PRECISION. 233 # endif 234 235 236 237 /* Newton's iteration for 1 / (nth root of x) is: 238 239 y' = y + [ (1 - x * y^n) * y / n ] 240 241 So, the iteration for 1 / sqrt(x) is: 242 243 y' = y + [ (1 - x * y^2) * y * 0.5 ] 244 245 If we want to do one iteration and multiply the result by x 246 and multiply the result by a scale factor we get: 247 248 y' = scale * x * ( y + [ (1 - x * y^2) * y * 0.5 ] ) 249 y' = scale * x * y * ( 1 + [ (1 - x * y^2) * 0.5 ] ) 250 y' = scale/2 * x * y * ( 2 + [ (1 - x * y^2) ] ) gives about 5/4 lsb error 251 y' = scale/2 * x * y * ( 3 - x * y^2 ) gives about 8/4 lsb error 252 253 So iterate to get better 1/sqrt(x) and multiply by x to get sqrt(x). */ 254 255 # define ITERATE_AND_MAYBE_CHECK_LAST_BIT(input) \ 256 a = y * scaled_x; \ 257 ulp = ULP_FACTOR; \ 258 b = a * y; \ 259 c = a * half_scale; \ 260 b = one - b; \ 261 a = c + c; \ 262 b = c * b; \ 263 ulp *= c; \ 264 y = a + b; \ 265 y_less_1_ulp = y - ulp; \ 266 ASSERT( y_less_1_ulp < y ); \ 267 y_plus_1_ulp = y + ulp; \ 268 ASSERT( y_plus_1_ulp > y ); \ 269 ESTABLISH_KNOWN_ROUNDING_MODE(orig_rounding_mode); \ 270 F_MUL_CHOPPED(y, y_less_1_ulp, a); \ 271 F_MUL_CHOPPED(y, y_plus_1_ulp, b); \ 272 RESTORE_ORIGINAL_ROUNDING_MODE(orig_rounding_mode); \ 273 y = ((a >= input) ? y_less_1_ulp : y); \ 274 y = ((b < input) ? y_plus_1_ulp : y); \ 275 276 # define RESULT y 277 278 #endif 279 280 281 #if (SINGLE_PRECISION) 282 283 # define F_SQRT(input, result) { \ 284 SQRT_FIRST_PART(input); \ 285 a = scaled_x * y; \ 286 b = half_scale + half_scale; \ 287 y = a * b; \ 288 (result) = (F_TYPE)y; \ 289 } 290 291 # define F_PRECISE_SQRT(input, result) { \ 292 SQRT_FIRST_PART(input); \ 293 NEWTONS_ITERATION; \ 294 NEWTONS_ITERATION; \ 295 ITERATE_AND_MAYBE_CHECK_LAST_BIT(input); \ 296 (result) = (F_TYPE) RESULT; \ 297 } 298 299 # define F_SQRT_2_LSB(input,result) F_SQRT(input,result) 300 301 # define F_SQRT_2_LSB_NO_SCALE_FINISH_ITERATION(input) \ 302 y *= (B_TYPE)(input) \ 303 y += y; 304 305 #else 306 307 # define F_SQRT(input, result) { \ 308 SQRT_FIRST_PART(input); \ 309 NEWTONS_ITERATION;\ 310 NEWTONS_ITERATION;\ 311 a = scaled_x * y; \ 312 b = a * y; \ 313 c = a * half_scale; \ 314 b = one - b; \ 315 a = c + c; \ 316 b = c * b; \ 317 y = a + b; \ 318 (result) = (F_TYPE)y; \ 319 } 320 321 # define F_PRECISE_SQRT(input, result) { \ 322 SQRT_FIRST_PART(input); \ 323 NEWTONS_ITERATION;\ 324 NEWTONS_ITERATION;\ 325 ITERATE_AND_MAYBE_CHECK_LAST_BIT(input); \ 326 (result) = (F_TYPE) RESULT; \ 327 } 328 329 # define F_SQRT_2_LSB(input, result) { \ 330 SQRT_FIRST_PART(input); \ 331 NEWTONS_ITERATION; \ 332 NEWTONS_ITERATION;\ 333 a = scaled_x * y; \ 334 b = a * y; \ 335 c = a * half_scale; \ 336 b = three - b; \ 337 y = c * b; \ 338 (result) = (F_TYPE)y; \ 339 } 340 341 # define F_SQRT_2_LSB_NO_SCALE_FINISH_ITERATION(input) \ 342 NEWTONS_ITERATION_NO_SCALE(input);\ 343 NEWTONS_ITERATION_NO_SCALE(input);\ 344 a = (B_TYPE)(input) * y; \ 345 b = a * y; \ 346 b = three - b; \ 347 y = a * b 348 349 #endif 350 351 352 /* The F_SQRT_2_LSB_NO_SCALE macro avoids most scaling (i.e. 0.5 <= input < 2.0). 353 The input for the polynomial is still scaled, however, because the 354 coefficients have a scale factor built into them. */ 355 356 #define F_SQRT_2_LSB_NO_SCALE_TIMES_2(input, result) { \ 357 B_UNION u; \ 358 B_TYPE y, a, b, c; \ 359 B_TYPE half = (B_TYPE)0.5; \ 360 B_TYPE one = (B_TYPE)1.0; \ 361 B_TYPE three = (B_TYPE)3.0; \ 362 LS_INT_TYPE exp; \ 363 U_LS_INT_TYPE index; \ 364 u.f = (B_TYPE)(input); \ 365 exp = u.B_HI_LS_INT_TYPE; \ 366 B_COPY_SIGN_AND_EXP((B_TYPE)(input), half, y); \ 367 GET_SQRT_TABLE_INDEX(exp,index); \ 368 b = (B_TYPE)D_SQRT_TABLE_NAME[index].b; \ 369 b *= y; \ 370 c = (B_TYPE)D_SQRT_TABLE_NAME[index].c; \ 371 c += b; \ 372 y *= y; \ 373 a = (B_TYPE)D_SQRT_TABLE_NAME[index].a; \ 374 y *= a; \ 375 y += c; \ 376 F_SQRT_2_LSB_NO_SCALE_FINISH_ITERATION(input); \ 377 (result) = (F_TYPE)y; \ 378 } 379 380 #endif /* SQRT_MACROS_H */ 381 382 383