1 /****************************************************************************** 2 Copyright (c) 2007-2011, Intel Corp. 3 All rights reserved. 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 this list of conditions and the following disclaimer. 10 * Redistributions in binary form must reproduce the above copyright 11 notice, this list of conditions and the following disclaimer in the 12 documentation and/or other materials provided with the distribution. 13 * Neither the name of Intel Corporation nor the names of its contributors 14 may be used to endorse or promote products derived from this software 15 without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 27 THE POSSIBILITY OF SUCH DAMAGE. 28 ******************************************************************************/ 29 30 #include <float.h> 31 32 33 #if (BITS_PER_WORD != 64) 34 # define EXT_MULH_32(i,j,hi) EXT_MULH((i),(j),(hi)) 35 #else 36 # define EXT_MULH_32(i,j,hi) (hi) = (INT_32) ((((WORD) (i)) * ((WORD) (j))) >> 32) 37 #endif 38 39 #define ARITH_SHIFT_WORD_RIGHT(i,j) (i) = (WORD)(i) >> (j); 40 41 42 43 #if (COMPILER == msc_cc) 44 # if BITS_PER_WORD == 32 45 46 # define EXT_UMUL(i,j,lo,hi) { \ 47 int tmp_i = (i); \ 48 int tmp_j = (j); \ 49 int tmp_lo, tmp_hi; \ 50 { \ 51 __asm mov eax, tmp_i \ 52 __asm mul tmp_j \ 53 __asm mov tmp_lo, eax \ 54 __asm mov tmp_hi, edx \ 55 } \ 56 (lo) = tmp_lo; \ 57 (hi) = tmp_hi; \ 58 } 59 60 61 # define EXT_UMULH(i,j,hi) { \ 62 int tmp_i = (i); \ 63 int tmp_j = (j); \ 64 int tmp_hi; \ 65 { \ 66 __asm mov eax, tmp_i \ 67 __asm mul tmp_j \ 68 __asm mov tmp_hi, edx \ 69 } \ 70 (hi) = tmp_hi; \ 71 } 72 73 74 # define EXT_MULH(i,j,hi) { \ 75 int tmp_i = (i); \ 76 int tmp_j = (j); \ 77 int tmp_hi; \ 78 { \ 79 __asm mov eax, tmp_i \ 80 __asm imul tmp_j \ 81 __asm mov tmp_hi, edx \ 82 } \ 83 (hi) = tmp_hi; \ 84 } 85 86 87 # define F_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT (BITS_PER_WORD - 1) 88 # define F_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) { \ 89 { \ 90 __asm fld x \ 91 __asm frndint \ 92 __asm fstp flt_int_x \ 93 } \ 94 int_x = (WORD)flt_int_x; \ 95 } 96 97 # define B_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT (BITS_PER_WORD - 1) 98 # define B_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) \ 99 F_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) 100 101 #endif 102 103 #define FPU_STATUS_WORD_TYPE unsigned short 104 105 106 #if (USE_CONTROL87) 107 108 #define INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word) { \ 109 status_word = _control87(0,0); \ 110 _control87(MCW_RC, _RC_NEAR); \ 111 } 112 113 #define INIT_FPU_STATE_AND_ROUND_TO_ZERO(status_word) { \ 114 status_word = _control87(0,0); \ 115 _control87(RC_CHOP, MCW_RC); \ 116 } 117 118 #define RESTORE_FPU_STATE(status_word) { \ 119 _control87((INT_16)status_word, 0xffff); \ 120 } 121 122 #else 123 124 #define INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word) { \ 125 FPU_STATUS_WORD_TYPE tmp = 0x037f; \ 126 { \ 127 __asm fstcw status_word \ 128 __asm fldcw tmp \ 129 } \ 130 } 131 132 #define INIT_FPU_STATE_AND_ROUND_TO_ZERO(status_word) { \ 133 FPU_STATUS_WORD_TYPE tmp = 0x0f7f; \ 134 { \ 135 __asm fstcw status_word \ 136 __asm fldcw tmp \ 137 } \ 138 } 139 140 #define RESTORE_FPU_STATE(status_word) { \ 141 FPU_STATUS_WORD_TYPE tmp = (FPU_STATUS_WORD_TYPE)(status_word); \ 142 { \ 143 __asm fldcw tmp \ 144 } \ 145 } 146 147 #endif 148 149 150 151 /* The following several macros are intended to be used as a set. It is 152 the combination of F_SAVE_SIGN_AND_GET_ABS and F_RESTORE_SIGN (or 153 F_NEGATE_IF_SIGN_NEG) that should be efficient (i.e. if slowing one of them 154 down will make the combination faster, go ahead and do it. */ 155 156 #ifndef F_SIGN_TYPE 157 158 # define F_SIGN_TYPE U_WORD 159 160 # define F_SAVE_SIGN_AND_GET_ABS(x, sign, abs_x) { \ 161 F_UNION u; \ 162 u.f = (x); \ 163 F_ABS((x), (abs_x)); \ 164 (sign) = u.F_HI_WORD; \ 165 } 166 167 # define F_CHANGE_SIGN(sign) \ 168 (sign) = ~(sign) 169 170 # define F_RESTORE_SIGN(sign, x) \ 171 ASSERT((x) >= 0.0); \ 172 if ((WORD)sign < 0) F_NEGATE(x); 173 174 # define F_NEGATE_IF_SIGN_NEG(sign, x) \ 175 if ((WORD)sign < 0) F_NEGATE(x); 176 177 #endif 178 179 180 181 #elif (COMPILER == gnu_cc) 182 183 184 185 #define __ABS(x,abs_x) { \ 186 abs_x = x; \ 187 __asm__ __volatile__ ("fabs" :"=t" (abs_x) : "0" (abs_x)); \ 188 } 189 190 #define __CLEAR_NEG_BIT(x) { \ 191 __asm__ __volatile__ ("fabs" :"=t" (x) : "0" (x)); \ 192 } 193 194 #define __NEGATE(x) { \ 195 __asm__ __volatile__ ("fchs" :"=t" (x) : "0" (x)); \ 196 } 197 198 #define __SET_NEG_BIT(x) { \ 199 __asm__ __volatile__ ("fabs" :"=t" (x) : "0" (x)); \ 200 __asm__ __volatile__ ("fchs" :"=t" (x) : "0" (x)); \ 201 } 202 203 #define __RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT (BITS_PER_WORD - 1) 204 #define __RINT_TO_FLOATING_AND_WORD(x, f_int_x, int_x) { \ 205 f_int_x = x; \ 206 __asm__ __volatile__ ("frndint" :"=t" (f_int_x) : "0" (f_int_x)); \ 207 int_x = (WORD) f_int_x; \ 208 } 209 210 211 #define F_ABS(x,abs_x) __ABS(x,abs_x) 212 #define F_CLEAR_NEG_BIT(x) __CLEAR_NEG_BIT(x) 213 #define F_NEGATE(x) __NEGATE(x) 214 #define F_SET_NEG_BIT(x) __SET_NEG_BIT(x) 215 216 #define F_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT \ 217 __RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT 218 #define F_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) \ 219 __RINT_TO_FLOATING_AND_WORD((x), (flt_int_x), (int_x)) 220 221 #if (F_FORMAT == s_floating) 222 223 #define B_ABS(x,abs_x) __ABS(x,abs_x) 224 #define B_CLEAR_NEG_BIT(x) __CLEAR_NEG_BIT(x) 225 #define B_NEGATE(x) __NEGATE(x) 226 #define B_SET_NEG_BIT(x) __SET_NEG_BIT(x) 227 228 #define B_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT \ 229 __RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT 230 #define B_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) \ 231 __RINT_TO_FLOATING_AND_WORD((x), (flt_int_x), (int_x)) 232 233 #endif /* F_FORMAT */ 234 235 236 #define INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word) { \ 237 volatile unsigned short tmp; \ 238 __asm__ volatile ("fstcw %0" : "=m" (tmp) : ); \ 239 status_word = tmp; \ 240 tmp &= 0xf2ff; \ 241 __asm__ volatile ("fldcw %0" : : "m" (tmp)); \ 242 } 243 244 #define INIT_FPU_STATE_AND_ROUND_TO_ZERO(status_word) { \ 245 volatile unsigned short tmp; \ 246 __asm__ volatile ("fstcw %0" : "=m" (tmp) : ); \ 247 status_word = tmp; \ 248 tmp &= 0xf2ff; \ 249 tmp |= 0x0c00; \ 250 __asm__ volatile ("fldcw %0" : : "m" (tmp)); \ 251 } 252 253 #define RESTORE_FPU_STATE(status_word) { \ 254 volatile unsigned short tmp = status_word; \ 255 __asm__ volatile ("fldcw %0" : : "m" (tmp)); \ 256 } 257 258 259 /* The following several macros are intended to be used as a set. It is 260 the combination of F_SAVE_SIGN_AND_GET_ABS and F_RESTORE_SIGN (or 261 F_NEGATE_IF_SIGN_NEG) that should be efficient (i.e. if slowing one of them 262 down will make the combination faster, go ahead and do it. */ 263 264 #ifndef F_SIGN_TYPE 265 266 # define F_SIGN_TYPE F_TYPE 267 268 # define F_SAVE_SIGN_AND_GET_ABS(x, sign, abs_x) \ 269 (sign) = x; \ 270 F_ABS((x), (abs_x)) 271 272 # define F_CHANGE_SIGN(sign) \ 273 F_NEGATE(sign) 274 275 # define F_RESTORE_SIGN(sign, x) \ 276 ASSERT((x) >= 0.0); \ 277 if ((sign) < 0.0) F_NEGATE(x) 278 279 # define F_NEGATE_IF_SIGN_NEG(sign, x) \ 280 if ((sign) < 0.0) F_NEGATE(x) 281 282 #endif 283 284 285 286 #endif /* COMPILER */ 287 288 289 290 291 #define F_ADD_ROUNDED(x,y,z) \ 292 { volatile F_TYPE vv; \ 293 vv = (F_TYPE) (x) + (y); \ 294 z = vv; } 295 296 #define B_ADD_ROUNDED(x,y,z) \ 297 { volatile B_TYPE vv; \ 298 vv = (B_TYPE) (x) + (y); \ 299 z = vv; } 300 301 #define F_ADD_CHOPPED(x,y,z) \ 302 { volatile F_TYPE vv; \ 303 vv = (F_TYPE) (x) + (y); \ 304 z = vv; } 305 306 #define B_ADD_CHOPPED(x,y,z) \ 307 { volatile B_TYPE vv; \ 308 vv = (B_TYPE) (x) + (y); \ 309 z = vv; } 310 311 #define F_MUL_CHOPPED(x,y,z) \ 312 { volatile F_TYPE vv; \ 313 vv = (F_TYPE) (x) * (y); \ 314 z = vv; } 315 316 #define ADD_SUB_BIG(z, big) \ 317 { volatile F_TYPE vv; \ 318 vv = z + big; \ 319 z = vv - big; } 320 321 #define ADD_SUB_BIG_CHOPPED(z, big) \ 322 { volatile F_TYPE vv; \ 323 ADD(z, big, vv); \ 324 z = vv - big; } 325 326 #define SHORTEN_VIA_CASTS(in, out) \ 327 { volatile S_TYPE vv; \ 328 vv = (S_TYPE) in; \ 329 out = (D_TYPE) vv; } 330 331 332 #define ASSIGN_WITH_F_TYPE_PRECISION(y, truncated_y) \ 333 { volatile F_TYPE vv; \ 334 vv = (F_TYPE) y; \ 335 truncated_y = vv; } 336 337 #define CHOP_TO_S_TYPE(y, truncated_y) \ 338 { volatile F_TYPE vv; \ 339 vv = (F_TYPE) y; \ 340 truncated_y = vv; } 341 342 343 344 345 346 /* The macro X_SQR_TO_HI_LO is used to produce high and low parts of x^2; */ 347 /* see the comments in DPML_ERF.C for details. The macros are defined */ 348 /* here specifically for the Intel platform to avoid a problem with the */ 349 /* CL386 compiler. Given the sequence */ 350 /* */ 351 /* a = ( B_TYPE ) ( ( F_TYPE ) b ) */ 352 /* */ 353 /* where a and b are of type B_TYPE, the compiler simple moves b into a */ 354 /* rather than first shortening it and then lengthening it. These macros */ 355 /* have been hand-crafted to work around this problem. */ 356 357 #if PRECISION_BACKUP_AVAILABLE 358 # define X_SQR_TO_HI_LO(x, t, hi, lo) { \ 359 volatile B_TYPE tv ; \ 360 volatile F_TYPE hiv, lov ; \ 361 tv = ( B_TYPE ) x ; \ 362 tv = tv * tv ; \ 363 hiv = ( F_TYPE ) tv ; \ 364 lov = ( F_TYPE ) ( tv - ( B_TYPE ) hiv ) ; \ 365 t = tv ; \ 366 hi = hiv ; \ 367 lo = lov ; \ 368 } 369 #else 370 # define X_SQR_TO_HI_LO(x, t, hi, lo) { \ 371 volatile F_TYPE hiv, lov ; \ 372 volatile R_TYPE xv ; \ 373 xv = ( R_TYPE ) x ; \ 374 hiv = ( F_TYPE ) xv ; \ 375 lov = x - hiv ; \ 376 lov = lov * ( hiv + x ) ; \ 377 hiv = hiv * hiv ; \ 378 hi = hiv ; \ 379 lo = lov ; \ 380 } 381 #endif 382 383 384 385 386 387 /* The following macros support extended precision multiplication of a */ 388 /* sequence of unsigned WORDs. The basic operation is an extended integer */ 389 /* multiply and add with four inputs and three results. The inputs are an */ 390 /* addend, in high and low parts (w_hi, w_lo), the carry in from a */ 391 /* previous operation, c_in, and the multiplier and multiplicand F and g. */ 392 /* The three outputs are the carry out, c_out, and the high and low digits */ 393 /* of the sum, z_hi and z_lo. The basic operation is */ 394 /* */ 395 /* F * g + w_lo + w_hi * BITS_PER_WORD + c_in -> */ 396 /* z_lo + z_hi * BITS_PER_WORD + c_out * BITS_PER_WORD^2 */ 397 /* */ 398 /* There are six different macros: one for the basic operation and five */ 399 /* special cases (e.g. to ignore the carry out or when the carry in is */ 400 /* zero). */ 401 /* */ 402 403 #define BITS_PER_DIGIT 64 404 405 #define __LO(x) ((x) & MAKE_MASK( BITS_PER_DIGIT / 2, 0 )) 406 #define __HI(x) (((x)) >> ( BITS_PER_DIGIT / 2 )) 407 408 409 #define UMULH(i, j, k) { \ 410 U_INT_64 _ii, _iLo, _iHi, _jj, _jLo, _jHi, _p0, _p1, _p2; \ 411 _ii = i; _iLo = __LO(_ii); _iHi = __HI(_ii); \ 412 _jj = j; _jLo = __LO(_jj); _jHi = __HI(_jj); \ 413 _p0 = _iLo * _jLo; \ 414 _p1 = (_iLo * _jHi); \ 415 _p2 = (_iHi * _jLo) + __HI(_p0) + __LO(_p1); \ 416 k = (_iHi * _jHi) + __HI(_p1) + __HI(_p2); \ 417 } 418 419 420 /* a * b + add_low -> low */ 421 #define MUL_ADD( a, b, add_low, low ) { \ 422 ( low ) = ( a ) * ( b ) + ( add_low ) ; \ 423 } 424 425 /* a * b -> low + high * 2^BITS_PER_WORD */ 426 #define XMUL( a, b, high, low ) { \ 427 U_WORD p0, p1, p2, p3, s ; \ 428 p0 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 429 p1 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 430 p2 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 431 p3 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 432 s = ( p0 >> ( BITS_PER_DIGIT / 2 ) ) + ( p1 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 433 ( p2 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 434 ( low ) = ( p0 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( s << ( BITS_PER_DIGIT / 2 ) ) ; \ 435 ( high ) = ( p1 >> ( BITS_PER_DIGIT / 2 ) ) + ( p2 >> ( BITS_PER_DIGIT / 2 ) ) + p3 + \ 436 ( s >> ( BITS_PER_DIGIT / 2 ) ) ; \ 437 } 438 439 /* a * b + add_low + add_high * 2^BITS_PER_WORD -> low + */ 440 /* high * 2^BITS_PER_WORD */ 441 #define XMUL_ADD( a, b, add_low, high, low ) { \ 442 U_WORD p0, p1, p2, p3, s, t ; \ 443 p0 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 444 p1 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 445 p2 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 446 p3 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 447 s = ( p0 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 448 t = ( p0 >> ( BITS_PER_DIGIT / 2 ) ) + ( p1 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 449 ( p2 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 450 ( s >> ( BITS_PER_DIGIT / 2 ) ) ; \ 451 ( low ) = ( s & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( t << ( BITS_PER_DIGIT / 2 ) ) ; \ 452 ( high ) = ( p1 >> ( BITS_PER_DIGIT / 2 ) ) + ( p2 >> ( BITS_PER_DIGIT / 2 ) ) + p3 + \ 453 ( t >> ( BITS_PER_DIGIT / 2 ) ) ; \ 454 } 455 456 /* a * b + add_low + add_high * 2^BITS_PER_WORD -> low + */ 457 /* high * 2^BITS_PER_WORD */ 458 #define XMUL_XADD( a, b, add_high, add_low, high, low ) { \ 459 U_WORD p0, p1, p2, p3, s, t ; \ 460 p0 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 461 p1 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 462 p2 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 463 p3 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 464 s = ( p0 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 465 t = ( p0 >> ( BITS_PER_DIGIT / 2 ) ) + ( p1 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 466 ( p2 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 467 ( s >> ( BITS_PER_DIGIT / 2 ) ) ; \ 468 ( low ) = ( s & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( t << ( BITS_PER_DIGIT / 2 ) ) ; \ 469 ( high ) = ( p1 >> ( BITS_PER_DIGIT / 2 ) ) + ( p2 >> ( BITS_PER_DIGIT / 2 ) ) + p3 + ( add_high ) + \ 470 ( t >> ( BITS_PER_DIGIT / 2 ) ) ; \ 471 } 472 473 /* a * b + add_low + add_high * 2^BITS_PER_WORD -> */ 474 /* low + */ 475 /* high * 2^BITS_PER_WORD + */ 476 /* carry_out * 2^(2 * BITS_PER_WORD) */ 477 #define XMUL_XADDC( a, b, add_high, add_low, carry_out, high, low ) { \ 478 U_WORD p0, p1, p2, p3, s, t, u, v ; \ 479 p0 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 480 p1 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 481 p2 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 482 p3 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 483 s = ( p0 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 484 t = ( p0 >> ( BITS_PER_DIGIT / 2 ) ) + ( p1 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 485 ( p2 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 486 ( s >> ( BITS_PER_DIGIT / 2 ) ) ; \ 487 u = ( p1 >> ( BITS_PER_DIGIT / 2 ) ) + ( p2 >> ( BITS_PER_DIGIT / 2 ) ) + \ 488 ( p3 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_high ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 489 ( t >> ( BITS_PER_DIGIT / 2 ) ) ; \ 490 v = ( p3 >> ( BITS_PER_DIGIT / 2 ) ) + ( ( add_high ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 491 ( u >> ( BITS_PER_DIGIT / 2 ) ) ; \ 492 ( low ) = ( s & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( t << ( BITS_PER_DIGIT / 2 ) ) ; \ 493 ( high ) = ( u & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( v << ( BITS_PER_DIGIT / 2 ) ) ; \ 494 ( carry_out ) = v >> ( BITS_PER_DIGIT / 2 ) ; \ 495 } 496 497 /* a * b + add_low + add_high * 2^BITS_PER_WORD + carry_in -> */ 498 /* low + */ 499 /* high * 2^BITS_PER_WORD */ 500 #define XMUL_XADD_W_C_IN( a, b, add_high, add_low, carry_in, high, low ) { \ 501 U_WORD p0, p1, p2, p3, s, t ; \ 502 p0 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 503 p1 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 504 p2 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 505 p3 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 506 s = ( p0 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 507 ( ( carry_in ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 508 t = ( p0 >> ( BITS_PER_DIGIT / 2 ) ) + ( p1 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 509 ( p2 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 510 ( ( carry_in ) >> ( BITS_PER_DIGIT / 2 ) ) + ( s >> ( BITS_PER_DIGIT / 2 ) ) ; \ 511 ( low ) = ( s & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( t << ( BITS_PER_DIGIT / 2 ) ) ; \ 512 ( high ) = ( p1 >> ( BITS_PER_DIGIT / 2 ) ) + ( p2 >> ( BITS_PER_DIGIT / 2 ) ) + p3 + ( add_high ) + \ 513 ( t >> ( BITS_PER_DIGIT / 2 ) ) ; \ 514 } 515 516 /* a * b + add_low + add_high * 2^BITS_PER_WORD + carry_in -> */ 517 /* low + */ 518 /* high * 2^BITS_PER_WORD + */ 519 /* carry_out * 2^(2 * BITS_PER_WORD) */ 520 #define XMUL_XADDC_W_C_IN( a, b, add_high, add_low, carry_in, carry_out, high, low ) { \ 521 U_WORD p0, p1, p2, p3, s, t, u, v ; \ 522 p0 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 523 p1 = ( ( a ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 524 p2 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 525 p3 = ( ( a ) >> ( BITS_PER_DIGIT / 2 ) ) * ( ( b ) >> ( BITS_PER_DIGIT / 2 ) ) ; \ 526 s = ( p0 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 527 ( ( carry_in ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) ; \ 528 t = ( p0 >> ( BITS_PER_DIGIT / 2 ) ) + ( p1 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 529 ( p2 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_low ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 530 ( ( carry_in ) >> ( BITS_PER_DIGIT / 2 ) ) + ( s >> ( BITS_PER_DIGIT / 2 ) ) ; \ 531 u = ( p1 >> ( BITS_PER_DIGIT / 2 ) ) + ( p2 >> ( BITS_PER_DIGIT / 2 ) ) + \ 532 ( p3 & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( ( add_high ) & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + \ 533 ( t >> ( BITS_PER_DIGIT / 2 ) ) ; \ 534 v = ( p3 >> ( BITS_PER_DIGIT / 2 ) ) + ( ( add_high ) >> ( BITS_PER_DIGIT / 2 ) ) + \ 535 ( u >> ( BITS_PER_DIGIT / 2 ) ) ; \ 536 ( low ) = ( s & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( t << ( BITS_PER_DIGIT / 2 ) ) ; \ 537 ( high ) = ( u & MAKE_MASK ( BITS_PER_DIGIT / 2, 0 ) ) + ( v << ( BITS_PER_DIGIT / 2 ) ) ; \ 538 ( carry_out )= v >> ( BITS_PER_DIGIT / 2 ) ; \ 539 } 540 541 542