1 /******************************************************************************
2   Copyright (c) 2007-2011, Intel Corp.
3   All rights reserved.
4 
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions are met:
7 
8     * Redistributions of source code must retain the above copyright notice,
9       this list of conditions and the following disclaimer.
10     * Redistributions in binary form must reproduce the above copyright
11       notice, this list of conditions and the following disclaimer in the
12       documentation and/or other materials provided with the distribution.
13     * Neither the name of Intel Corporation nor the names of its contributors
14       may be used to endorse or promote products derived from this software
15       without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27   THE POSSIBILITY OF SUCH DAMAGE.
28 ******************************************************************************/
29 
30 #include <float.h>
31 
32 
/*  EXT_MULH_32(i, j, hi): hi <- high 32 bits of the signed 64-bit product
    i * j.  With a 32-bit WORD this defers to the generic EXT_MULH; with a
    64-bit WORD the full product fits in one WORD, so multiply and shift
    the upper half down.						    */
#if (BITS_PER_WORD != 64)
#   define EXT_MULH_32(i,j,hi) EXT_MULH((i),(j),(hi))
#else
#   define EXT_MULH_32(i,j,hi) (hi) =  (INT_32) ((((WORD) (i)) * ((WORD) (j))) >> 32)
#endif

/*  Arithmetic (sign-propagating) right shift of i by j bits, in place.
    NOTE(review): right-shifting a negative signed value is
    implementation-defined in C; this relies on the usual arithmetic-shift
    behavior of the target compilers.  The trailing semicolon is part of
    the macro, so call sites must not add their own.			    */
#define ARITH_SHIFT_WORD_RIGHT(i,j) (i) = (WORD)(i) >> (j);
40 
41 
42 
43 #if (COMPILER == msc_cc)
44 #  if BITS_PER_WORD == 32
45 
/*  EXT_UMUL(i, j, lo, hi): full 32x32 -> 64-bit UNSIGNED multiply using
    the x86 `mul' instruction, which leaves the product in edx:eax.
    lo <- low 32 bits, hi <- high 32 bits.  The temporaries ensure each
    argument is evaluated exactly once.					    */
#    define EXT_UMUL(i,j,lo,hi) { \
	int tmp_i = (i); \
	int tmp_j = (j); \
	int tmp_lo, tmp_hi; \
	{ \
		__asm mov eax, tmp_i \
		__asm mul tmp_j \
		__asm mov tmp_lo, eax \
		__asm mov tmp_hi, edx \
	} \
	(lo) = tmp_lo; \
	(hi) = tmp_hi; \
}
59 
60 
/*  EXT_UMULH(i, j, hi): hi <- high 32 bits of the UNSIGNED 32x32 product
    (the edx half of `mul'); the low half is discarded.			    */
#    define EXT_UMULH(i,j,hi) { \
	int tmp_i = (i); \
	int tmp_j = (j); \
	int tmp_hi; \
	{ \
		__asm mov eax, tmp_i \
		__asm mul tmp_j \
		__asm mov tmp_hi, edx \
	} \
	(hi) = tmp_hi; \
}
72 
73 
/*  EXT_MULH(i, j, hi): hi <- high 32 bits of the SIGNED 32x32 product
    (one-operand `imul' leaves the product in edx:eax).			    */
#    define EXT_MULH(i,j,hi) { \
	int tmp_i = (i); \
	int tmp_j = (j); \
	int tmp_hi; \
	{ \
		__asm mov eax, tmp_i \
		__asm imul tmp_j \
		__asm mov tmp_hi, edx \
	} \
	(hi) = tmp_hi; \
}
85 
86 
/*  Round x to an integral value using the current x87 rounding mode
    (frndint), delivering it both in floating form (flt_int_x) and as an
    integer WORD (int_x).  Because flt_int_x is already integral, the
    truncation performed by the C cast is exact.  Valid only for
    |x| < 2^(BITS_PER_WORD - 1), hence the *_PRECISION_LIMIT constant.	    */
#    define F_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT (BITS_PER_WORD - 1)
#    define F_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) { \
	{ \
		__asm fld x \
		__asm frndint \
		__asm fstp flt_int_x \
	} \
	int_x = (WORD)flt_int_x; \
}

/*  Backup-precision (B_TYPE) form maps directly onto the F_ form.	    */
#    define B_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT (BITS_PER_WORD - 1)
#    define B_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) \
	F_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x)
100 
101 #endif
102 
/*  Type wide enough to hold the 16-bit x87 control word.		    */
#define FPU_STATUS_WORD_TYPE unsigned short
104 
105 
/*  FPU control-word management.  Each INIT_* macro saves the current FPU
    control state into status_word and then forces a rounding mode;
    RESTORE_FPU_STATE restores the saved state.  One implementation uses
    the MSVC run-time routine _control87(new_value, mask) — which updates
    only the control-word bits selected by mask and returns the previous
    control word — the other uses inline x87 fstcw/fldcw.		    */

#if (USE_CONTROL87)

/*  Save state, then select round-to-nearest.
    BUG FIX: this previously called _control87(MCW_RC, _RC_NEAR), i.e.
    with the value and mask operands swapped; since _RC_NEAR == 0 the
    mask was zero and the call changed nothing.  _control87 takes the new
    value first and the mask second, exactly as the ROUND_TO_ZERO macro
    below already does.							    */
#define INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word) { \
	status_word = _control87(0,0); \
	_control87(_RC_NEAR, MCW_RC); \
}

/*  Save state, then select round-toward-zero (chop).			    */
#define INIT_FPU_STATE_AND_ROUND_TO_ZERO(status_word) { \
	status_word = _control87(0,0); \
	_control87(RC_CHOP, MCW_RC); \
}

/*  Restore every control-word field saved by an INIT_* macro (the mask
    0xffff selects all bits).						    */
#define RESTORE_FPU_STATE(status_word) { \
	_control87((INT_16)status_word, 0xffff); \
}

#else

/*  x87 inline-assembly versions: fstcw stores the current control word,
    fldcw loads the replacement.  0x037f = all exceptions masked,
    extended (64-bit) precision, round-to-nearest.			    */
#define INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word) { \
	FPU_STATUS_WORD_TYPE tmp = 0x037f; \
	{ \
		__asm fstcw status_word \
		__asm fldcw tmp \
	} \
}

/*  0x0f7f = all exceptions masked, extended precision, round toward
    zero (RC field = 11b).						    */
#define INIT_FPU_STATE_AND_ROUND_TO_ZERO(status_word) { \
	FPU_STATUS_WORD_TYPE tmp = 0x0f7f; \
	{ \
		__asm fstcw status_word \
		__asm fldcw tmp \
	} \
}

/*  Reload the control word previously saved by an INIT_* macro.	    */
#define RESTORE_FPU_STATE(status_word) { \
	FPU_STATUS_WORD_TYPE tmp = (FPU_STATUS_WORD_TYPE)(status_word); \
	{ \
		__asm fldcw tmp \
	} \
}

#endif
148 
149 
150 
151 /* The following several macros are intended to be used as a set.  It is
152 the combination of F_SAVE_SIGN_AND_GET_ABS and F_RESTORE_SIGN (or
153 F_NEGATE_IF_SIGN_NEG) that should be efficient (i.e. if slowing one of them
154 down will make the combination faster, go ahead and do it.  */
155 
#ifndef F_SIGN_TYPE

/*  The sign is captured as the high word of the floating value (read
    through an F_UNION overlay); only its top bit is meaningful — the
    consumers below test (WORD)sign < 0.				    */
#	define F_SIGN_TYPE U_WORD

/*  Capture the sign of x and produce abs_x = |x|; x is not modified.	    */
#	define F_SAVE_SIGN_AND_GET_ABS(x, sign, abs_x) { \
		F_UNION u; \
		u.f = (x); \
		F_ABS((x), (abs_x)); \
		(sign) = u.F_HI_WORD; \
	}

/*  Flip the saved sign (complementing all bits flips the top bit).	    */
#	define F_CHANGE_SIGN(sign) \
		(sign) = ~(sign)

/*  Negate x if the saved sign was negative.  CAUTION: the two macros
    below expand to more than one statement with no enclosing braces, so
    neither may be used as the unbraced body of an if/else.		    */
#	define F_RESTORE_SIGN(sign, x) \
		ASSERT((x) >= 0.0); \
		if ((WORD)sign < 0) F_NEGATE(x);

#	define F_NEGATE_IF_SIGN_NEG(sign, x) \
		if ((WORD)sign < 0) F_NEGATE(x);

#endif
178 
179 
180 
181 #elif (COMPILER == gnu_cc)
182 
183 
184 
/*  GNU C versions: x87 operations expressed as extended asm acting on
    the top of the floating-point stack (the "t" constraint).		    */

/*  abs_x <- |x| (fabs); x is left unchanged.				    */
#define __ABS(x,abs_x) { \
	abs_x = x; \
	__asm__ __volatile__ ("fabs" :"=t" (abs_x) : "0" (abs_x)); \
}

/*  Clear the sign bit of x in place (fabs).				    */
#define __CLEAR_NEG_BIT(x) { \
	__asm__ __volatile__ ("fabs" :"=t" (x) : "0" (x)); \
}

/*  x <- -x (fchs).							    */
#define __NEGATE(x) { \
	__asm__ __volatile__ ("fchs" :"=t" (x) : "0" (x)); \
}

/*  Force the sign bit of x on: take |x|, then negate.			    */
#define __SET_NEG_BIT(x) { \
	__asm__ __volatile__ ("fabs" :"=t" (x) : "0" (x)); \
	__asm__ __volatile__ ("fchs" :"=t" (x) : "0" (x)); \
}

/*  Round x to an integral value with frndint (current rounding mode),
    yielding both a floating (f_int_x) and an integer (int_x) result.
    Valid only for |x| < 2^(BITS_PER_WORD - 1); since f_int_x is already
    integral, the truncating C cast is exact.				    */
#define __RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT (BITS_PER_WORD - 1)
#define __RINT_TO_FLOATING_AND_WORD(x, f_int_x, int_x) { \
	f_int_x = x; \
	__asm__ __volatile__ ("frndint" :"=t" (f_int_x) : "0" (f_int_x)); \
	int_x = (WORD) f_int_x;  \
}
209 
210 
/*  Map the generic working-precision (F_) operation names onto the x87
    helpers defined above.						    */
#define F_ABS(x,abs_x) __ABS(x,abs_x)
#define F_CLEAR_NEG_BIT(x) __CLEAR_NEG_BIT(x)
#define F_NEGATE(x) __NEGATE(x)
#define F_SET_NEG_BIT(x) __SET_NEG_BIT(x)

#define F_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT \
	__RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT
#define F_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) \
	__RINT_TO_FLOATING_AND_WORD((x), (flt_int_x), (int_x))

/*  When the working type is single precision the backup (B_) type can
    share the same x87 implementations.					    */
#if (F_FORMAT == s_floating)

#define B_ABS(x,abs_x) __ABS(x,abs_x)
#define B_CLEAR_NEG_BIT(x) __CLEAR_NEG_BIT(x)
#define B_NEGATE(x) __NEGATE(x)
#define B_SET_NEG_BIT(x) __SET_NEG_BIT(x)

#define B_RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT \
	__RINT_TO_FLOATING_AND_WORD_PRECISION_LIMIT
#define B_RINT_TO_FLOATING_AND_WORD(x, flt_int_x, int_x) \
	__RINT_TO_FLOATING_AND_WORD((x), (flt_int_x), (int_x))

#endif  /* F_FORMAT */
234 
235 
/*  gcc FPU control-word management: fstcw saves the 16-bit x87 control
    word into status_word; fldcw loads a modified copy.  The mask 0xf2ff
    clears the rounding-control field (bits 10-11 -> 00b, round to
    nearest) and also clears precision-control bit 8.
    NOTE(review): clearing bit 8 demotes extended precision (PC = 11b)
    to double (10b) — presumably intentional for double-precision DPML;
    confirm.								    */
#define INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word) { \
	volatile unsigned short tmp; \
	__asm__ volatile ("fstcw %0" : "=m" (tmp) : ); \
	status_word = tmp; \
	tmp &= 0xf2ff; \
	__asm__ volatile ("fldcw %0" : : "m" (tmp)); \
}

/*  As above, but then set bits 10-11 to 11b = round toward zero.	    */
#define INIT_FPU_STATE_AND_ROUND_TO_ZERO(status_word) { \
	volatile unsigned short tmp; \
	__asm__ volatile ("fstcw %0" : "=m" (tmp) : ); \
	status_word = tmp; \
	tmp &= 0xf2ff; \
	tmp |= 0x0c00; \
	__asm__ volatile ("fldcw %0" : : "m" (tmp)); \
}

/*  Reload the control word saved by an INIT_* macro.			    */
#define RESTORE_FPU_STATE(status_word) { \
	volatile unsigned short tmp = status_word; \
	__asm__ volatile ("fldcw %0" : : "m" (tmp)); \
}
257 
258 
259 /* The following several macros are intended to be used as a set.  It is
260 the combination of F_SAVE_SIGN_AND_GET_ABS and F_RESTORE_SIGN (or
261 F_NEGATE_IF_SIGN_NEG) that should be efficient (i.e. if slowing one of them
262 down will make the combination faster, go ahead and do it.  */
263 
#ifndef F_SIGN_TYPE

/*  gcc variant: the sign is carried simply as a floating copy of the
    original value, and tested with an ordinary < 0.0 comparison.	    */
#    define	F_SIGN_TYPE F_TYPE

/*  Save x (for its sign) and produce abs_x = |x|.  CAUTION: this macro
    and the two conditional macros below expand to multiple statements /
    an unbraced if, so none of them may be used as the unbraced body of
    an if/else.								    */
#    define	F_SAVE_SIGN_AND_GET_ABS(x, sign, abs_x) \
		(sign) = x; \
		F_ABS((x), (abs_x))

/*  Flip the saved sign by negating the saved value.			    */
#    define	F_CHANGE_SIGN(sign) \
		F_NEGATE(sign)

/*  Negate x if the saved value was negative.				    */
#    define	F_RESTORE_SIGN(sign, x) \
		ASSERT((x) >= 0.0); \
		if ((sign) < 0.0) F_NEGATE(x)

#    define	F_NEGATE_IF_SIGN_NEG(sign, x) \
		if ((sign) < 0.0) F_NEGATE(x)

#endif
283 
284 
285 
286 #endif  /* COMPILER */
287 
288 
289 
290 
/*  z <- x + y in F_TYPE.  The volatile temporary forces the result to be
    stored at F_TYPE precision (discarding any extra x87 register
    precision) before the assignment to z.				    */
#define F_ADD_ROUNDED(x,y,z) \
{   volatile F_TYPE vv; \
    vv = (F_TYPE) (x) + (y); \
    z = vv; }

/*  Backup-precision (B_TYPE) version of the above.			    */
#define B_ADD_ROUNDED(x,y,z) \
{   volatile B_TYPE vv; \
    vv = (B_TYPE) (x) + (y); \
    z = vv; }

/*  NOTE(review): the *_CHOPPED variants are textually identical to the
    *_ROUNDED ones; the chopped behavior presumably relies on the caller
    having selected round-toward-zero (INIT_FPU_STATE_AND_ROUND_TO_ZERO)
    beforehand — confirm at the call sites.				    */
#define F_ADD_CHOPPED(x,y,z) \
{   volatile F_TYPE vv; \
    vv = (F_TYPE) (x) + (y); \
    z = vv; }

#define B_ADD_CHOPPED(x,y,z) \
{   volatile B_TYPE vv; \
    vv = (B_TYPE) (x) + (y); \
    z = vv; }

/*  z <- x * y at F_TYPE precision, same volatile-store pattern.	    */
#define F_MUL_CHOPPED(x,y,z) \
{   volatile F_TYPE vv; \
    vv = (F_TYPE) (x) * (y); \
    z = vv; }
315 
/*  Trim low-order bits of z by adding and then subtracting a large
    constant `big'; the volatile store forces the intermediate sum to
    F_TYPE precision so the trim actually happens.			    */
#define ADD_SUB_BIG(z, big) \
{ volatile F_TYPE vv; \
    vv = z + big; \
    z = vv - big; }

/*  Same, but the initial add goes through the ADD macro (defined
    elsewhere).								    */
#define ADD_SUB_BIG_CHOPPED(z, big) \
{ volatile F_TYPE vv; \
    ADD(z, big, vv); \
    z = vv - big; }

/*  out <- in narrowed to S_TYPE then re-widened to D_TYPE; the volatile
    temporary stops the compiler from eliding the narrowing (see the
    CL386 note ahead of X_SQR_TO_HI_LO further down this file).		    */
#define SHORTEN_VIA_CASTS(in, out) \
{ volatile S_TYPE vv; \
    vv = (S_TYPE) in; \
    out = (D_TYPE) vv; }


/*  truncated_y <- y stored at F_TYPE precision.			    */
#define ASSIGN_WITH_F_TYPE_PRECISION(y, truncated_y) \
{ volatile F_TYPE vv; \
    vv = (F_TYPE) y; \
    truncated_y = vv; }

/*  NOTE(review): despite its name this is identical to the macro above —
    it stores through a volatile F_TYPE, not S_TYPE.  Verify against the
    call sites whether S_TYPE was intended here.			    */
#define CHOP_TO_S_TYPE(y, truncated_y) \
{ volatile F_TYPE vv; \
    vv = (F_TYPE) y; \
    truncated_y = vv; }
341 
342 
343 
344 
345 
346 /*  The macro X_SQR_TO_HI_LO is used to produce high and low parts of x^2;  */
347 /*  see the comments in DPML_ERF.C for details.  The macros are defined	    */
348 /*  here specifically for the Intel platform to avoid a problem with the    */
349 /*  CL386 compiler.  Given the sequence					    */
350 /*									    */
351 /*		a = ( B_TYPE ) ( ( F_TYPE ) b )				    */
352 /*									    */
353 /*  where a and b are of type B_TYPE, the compiler simple moves b into a    */
354 /*  rather than first shortening it and then lengthening it.  These macros  */
355 /*  have been hand-crafted to work around this problem.			    */
356 
#if PRECISION_BACKUP_AVAILABLE
/*  x^2 computed in the wider B_TYPE, then split: hi is the full product
    rounded to F_TYPE and lo the remainder t - hi.  The volatile
    temporaries force each narrowing to actually occur (see the CL386
    note above).  t receives the full-precision square.			    */
#   define X_SQR_TO_HI_LO(x, t, hi, lo) { \
	volatile B_TYPE tv ; \
	volatile F_TYPE hiv, lov ; \
	tv = ( B_TYPE ) x ; \
	tv = tv * tv ; \
	hiv = ( F_TYPE ) tv ; \
	lov = ( F_TYPE ) ( tv - ( B_TYPE ) hiv ) ; \
	t = tv ; \
	hi = hiv ; \
	lo = lov ; \
	}
#else
/*  No wider type available: split x into a short (R_TYPE) high part xv
    and use the identity x^2 = xv^2 + (x - xv)*(x + xv) to obtain the
    hi/lo pieces.  NOTE: this branch never assigns t.			    */
#   define X_SQR_TO_HI_LO(x, t, hi, lo) { \
	volatile F_TYPE hiv, lov ; \
	volatile R_TYPE xv ; \
	xv = ( R_TYPE ) x ; \
	hiv = ( F_TYPE ) xv ; \
	lov = x - hiv ; \
	lov = lov * ( hiv + x ) ; \
	hiv = hiv * hiv ; \
	hi = hiv ; \
	lo = lov ; \
	}
#endif
382 
383 
384 
385 
386 
387 /*  The following macros support extended precision multiplication of a	    */
388 /*  sequence of unsigned WORDs.  The basic operation is an extended integer */
389 /*  multiply and add with four inputs and three results.  The inputs are an */
390 /*  addend, in high and low parts (w_hi, w_lo), the carry in from a	    */
391 /*  previous operation, c_in, and the multiplier and multiplicand F and g.  */
392 /*  The three outputs are the carry out, c_out, and the high and low digits */
393 /*  of the sum, z_hi and z_lo.  The basic operation is			    */
394 /*									    */
395 /*	F * g + w_lo + w_hi * BITS_PER_WORD + c_in ->			    */
396 /*		    z_lo + z_hi * BITS_PER_WORD + c_out * BITS_PER_WORD^2   */
397 /*									    */
398 /*  There are six different macros:  one for the basic operation and five   */
399 /*  special cases (e.g. to ignore the carry out or when the carry in is	    */
400 /*  zero).								    */
401 /*									    */
402 
/*  Digit size for the multi-word multiply macros below.
    NOTE(review): hard-coded to 64, independent of the BITS_PER_WORD
    conditionals used elsewhere in this file — confirm for 32-bit
    builds.								    */
#define BITS_PER_DIGIT       64

/*  Low and high half-digits of x (MAKE_MASK is defined elsewhere).	    */
#define __LO(x)	((x) & MAKE_MASK( BITS_PER_DIGIT / 2, 0 ))
#define __HI(x)	(((x)) >> ( BITS_PER_DIGIT / 2 ))
407 
408 
/*  k <- high 64 bits of the unsigned product i * j, built from four
    32x32-bit partial products:
	i*j = _iLo*_jLo + (_iLo*_jHi + _iHi*_jLo)*2^32 + _iHi*_jHi*2^64.
    _p2 gathers the carry out of _p0 and the low half of _p1 together
    with _iHi*_jLo; it cannot overflow, since
    (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1.  The low 64 bits of the product
    are discarded.							    */
#define UMULH(i, j, k) {						\
	U_INT_64 _ii, _iLo, _iHi, _jj, _jLo, _jHi, _p0, _p1, _p2; 	\
        _ii = i; _iLo = __LO(_ii); _iHi = __HI(_ii);			\
        _jj = j; _jLo = __LO(_jj); _jHi = __HI(_jj);			\
	_p0  = _iLo * _jLo;						\
	_p1  = (_iLo * _jHi);						\
	_p2  = (_iHi * _jLo) + __HI(_p0) + __LO(_p1);			\
        k   = (_iHi * _jHi) + __HI(_p1) + __HI(_p2);			\
	}
418 
419 
/*  Single-digit multiply-accumulate:  a * b + add_low -> low.  Any	    */
/*  overflow beyond one digit is discarded (unsigned wrap-around).	    */
#define MUL_ADD( a, b, add_low, low ) { \
    ( low ) = ( add_low ) + ( a ) * ( b ) ; \
    }
424 
/*  Extended unsigned multiply:  a * b -> high * 2^BITS_PER_WORD + low.	    */
/*  Each operand is split into half-digit pieces with __LO/__HI; the four   */
/*  partial products are recombined, propagating the carries through the    */
/*  half-digit accumulator `mid'.					    */
#define XMUL( a, b, high, low ) { \
    U_WORD q0, q1, q2, q3, mid ; \
    q0 = __LO( a ) * __LO( b ) ; \
    q1 = __LO( a ) * __HI( b ) ; \
    q2 = __HI( a ) * __LO( b ) ; \
    q3 = __HI( a ) * __HI( b ) ; \
    mid = __HI( q0 ) + __LO( q1 ) + __LO( q2 ) ; \
    ( low ) = __LO( q0 ) + ( mid << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( high ) = __HI( q1 ) + __HI( q2 ) + q3 + __HI( mid ) ; \
    }
438 
/*  Extended unsigned multiply-add:					    */
/*	a * b + add_low  ->  high * 2^BITS_PER_WORD + low		    */
/*  Half-digit partial products (__LO/__HI) with explicit carry		    */
/*  propagation through lo_acc and mid_acc.				    */
#define XMUL_ADD( a, b, add_low, high, low ) { \
    U_WORD q0, q1, q2, q3, lo_acc, mid_acc ; \
    q0 = __LO( a ) * __LO( b ) ; \
    q1 = __LO( a ) * __HI( b ) ; \
    q2 = __HI( a ) * __LO( b ) ; \
    q3 = __HI( a ) * __HI( b ) ; \
    lo_acc = __LO( q0 ) + __LO( add_low ) ; \
    mid_acc = __HI( q0 ) + __LO( q1 ) + __LO( q2 ) + __HI( add_low ) + __HI( lo_acc ) ; \
    ( low ) = __LO( lo_acc ) + ( mid_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( high ) = __HI( q1 ) + __HI( q2 ) + q3 + __HI( mid_acc ) ; \
    }
455 
/*  Extended unsigned multiply with two-digit addend:			    */
/*	a * b + add_low + add_high * 2^BITS_PER_WORD			    */
/*		->  high * 2^BITS_PER_WORD + low			    */
/*  Like XMUL_ADD, with add_high folded into the high digit; any carry	    */
/*  out of the high digit is discarded.					    */
#define XMUL_XADD( a, b, add_high, add_low, high, low ) { \
    U_WORD q0, q1, q2, q3, lo_acc, mid_acc ; \
    q0 = __LO( a ) * __LO( b ) ; \
    q1 = __LO( a ) * __HI( b ) ; \
    q2 = __HI( a ) * __LO( b ) ; \
    q3 = __HI( a ) * __HI( b ) ; \
    lo_acc = __LO( q0 ) + __LO( add_low ) ; \
    mid_acc = __HI( q0 ) + __LO( q1 ) + __LO( q2 ) + __HI( add_low ) + __HI( lo_acc ) ; \
    ( low ) = __LO( lo_acc ) + ( mid_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( high ) = __HI( q1 ) + __HI( q2 ) + q3 + ( add_high ) + __HI( mid_acc ) ; \
    }
472 
/*  Extended unsigned multiply-add producing a carry out:		    */
/*	a * b + add_low + add_high * 2^BITS_PER_WORD			    */
/*		->  low + high * 2^BITS_PER_WORD			    */
/*		       + carry_out * 2^(2 * BITS_PER_WORD)		    */
/*  Carries are propagated half a digit at a time through lo_acc,	    */
/*  mid_acc, hi_acc and top_acc; the final spill appears in carry_out.	    */
#define XMUL_XADDC( a, b, add_high, add_low, carry_out, high, low ) { \
    U_WORD q0, q1, q2, q3, lo_acc, mid_acc, hi_acc, top_acc ; \
    q0 = __LO( a ) * __LO( b ) ; \
    q1 = __LO( a ) * __HI( b ) ; \
    q2 = __HI( a ) * __LO( b ) ; \
    q3 = __HI( a ) * __HI( b ) ; \
    lo_acc = __LO( q0 ) + __LO( add_low ) ; \
    mid_acc = __HI( q0 ) + __LO( q1 ) + __LO( q2 ) + __HI( add_low ) + __HI( lo_acc ) ; \
    hi_acc = __HI( q1 ) + __HI( q2 ) + __LO( q3 ) + __LO( add_high ) + __HI( mid_acc ) ; \
    top_acc = __HI( q3 ) + __HI( add_high ) + __HI( hi_acc ) ; \
    ( low ) = __LO( lo_acc ) + ( mid_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( high ) = __LO( hi_acc ) + ( top_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( carry_out ) = __HI( top_acc ) ; \
    }
496 
/*  Extended unsigned multiply-add with carry in:			    */
/*	a * b + add_low + add_high * 2^BITS_PER_WORD + carry_in		    */
/*		->  low + high * 2^BITS_PER_WORD			    */
/*  Identical to XMUL_XADD except that carry_in is folded into the low	    */
/*  and middle half-digit accumulators; carry out of the high digit is	    */
/*  discarded.								    */
#define XMUL_XADD_W_C_IN( a, b, add_high, add_low, carry_in, high, low ) { \
    U_WORD q0, q1, q2, q3, lo_acc, mid_acc ; \
    q0 = __LO( a ) * __LO( b ) ; \
    q1 = __LO( a ) * __HI( b ) ; \
    q2 = __HI( a ) * __LO( b ) ; \
    q3 = __HI( a ) * __HI( b ) ; \
    lo_acc = __LO( q0 ) + __LO( add_low ) + __LO( carry_in ) ; \
    mid_acc = __HI( q0 ) + __LO( q1 ) + __LO( q2 ) + __HI( add_low ) + \
	__HI( carry_in ) + __HI( lo_acc ) ; \
    ( low ) = __LO( lo_acc ) + ( mid_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( high ) = __HI( q1 ) + __HI( q2 ) + q3 + ( add_high ) + __HI( mid_acc ) ; \
    }
515 
/*  Extended unsigned multiply-add with both carry in and carry out:	    */
/*	a * b + add_low + add_high * 2^BITS_PER_WORD + carry_in		    */
/*		->  low + high * 2^BITS_PER_WORD			    */
/*		       + carry_out * 2^(2 * BITS_PER_WORD)		    */
/*  Combines XMUL_XADDC's carry-out chain with XMUL_XADD_W_C_IN's	    */
/*  carry-in folding.							    */
#define XMUL_XADDC_W_C_IN( a, b, add_high, add_low, carry_in, carry_out, high, low ) { \
    U_WORD q0, q1, q2, q3, lo_acc, mid_acc, hi_acc, top_acc ; \
    q0 = __LO( a ) * __LO( b ) ; \
    q1 = __LO( a ) * __HI( b ) ; \
    q2 = __HI( a ) * __LO( b ) ; \
    q3 = __HI( a ) * __HI( b ) ; \
    lo_acc = __LO( q0 ) + __LO( add_low ) + __LO( carry_in ) ; \
    mid_acc = __HI( q0 ) + __LO( q1 ) + __LO( q2 ) + __HI( add_low ) + \
	__HI( carry_in ) + __HI( lo_acc ) ; \
    hi_acc = __HI( q1 ) + __HI( q2 ) + __LO( q3 ) + __LO( add_high ) + __HI( mid_acc ) ; \
    top_acc = __HI( q3 ) + __HI( add_high ) + __HI( hi_acc ) ; \
    ( low ) = __LO( lo_acc ) + ( mid_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( high ) = __LO( hi_acc ) + ( top_acc << ( BITS_PER_DIGIT / 2 ) ) ; \
    ( carry_out ) = __HI( top_acc ) ; \
    }
540 
541 
542