1 /* Copyright (C) 2007-2020 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free
7 Software Foundation; either version 3, or (at your option) any later
8 version.
9 
10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 for more details.
14 
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18 
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22 <http://www.gnu.org/licenses/>.  */
23 
24 #include "bid_internal.h"
25 
26 
27 #if DECIMAL_CALL_BY_REFERENCE
28 void
bid64dq_add(UINT64 * pres,UINT64 * px,UINT128 * py _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM _EXC_INFO_PARAM)29 bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
30 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
31 	     _EXC_INFO_PARAM) {
32   UINT64 x = *px;
33 #if !DECIMAL_GLOBAL_ROUNDING
34   unsigned int rnd_mode = *prnd_mode;
35 #endif
36 #else
37 UINT64
38 bid64dq_add (UINT64 x, UINT128 y
39 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
40 	     _EXC_INFO_PARAM) {
41 #endif
42   UINT64 res = 0xbaddbaddbaddbaddull;
43   UINT128 x1;
44 
45 #if DECIMAL_CALL_BY_REFERENCE
46   bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
47   bid64qq_add (&res, &x1, py
48 	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
49 	       _EXC_INFO_ARG);
50 #else
51   x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
52   res = bid64qq_add (x1, y
53 		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
54 		     _EXC_INFO_ARG);
55 #endif
56   BID_RETURN (res);
57 }
58 
59 
60 #if DECIMAL_CALL_BY_REFERENCE
61 void
62 bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
63 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
64 	     _EXC_INFO_PARAM) {
65   UINT64 y = *py;
66 #if !DECIMAL_GLOBAL_ROUNDING
67   unsigned int rnd_mode = *prnd_mode;
68 #endif
69 #else
70 UINT64
71 bid64qd_add (UINT128 x, UINT64 y
72 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
73 	     _EXC_INFO_PARAM) {
74 #endif
75   UINT64 res = 0xbaddbaddbaddbaddull;
76   UINT128 y1;
77 
78 #if DECIMAL_CALL_BY_REFERENCE
79   bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
80   bid64qq_add (&res, px, &y1
81 	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
82 	       _EXC_INFO_ARG);
83 #else
84   y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
85   res = bid64qq_add (x, y1
86 		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
87 		     _EXC_INFO_ARG);
88 #endif
89   BID_RETURN (res);
90 }
91 
92 
93 #if DECIMAL_CALL_BY_REFERENCE
94 void
95 bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
96 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
97 	     _EXC_INFO_PARAM) {
98   UINT128 x = *px, y = *py;
99 #if !DECIMAL_GLOBAL_ROUNDING
100   unsigned int rnd_mode = *prnd_mode;
101 #endif
102 #else
103 UINT64
104 bid64qq_add (UINT128 x, UINT128 y
105 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
106 	     _EXC_INFO_PARAM) {
107 #endif
108 
109   UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
110   };
111   UINT64 res = 0xbaddbaddbaddbaddull;
112 
113   BID_SWAP128 (one);
114 #if DECIMAL_CALL_BY_REFERENCE
115   bid64qqq_fma (&res, &one, &x, &y
116 		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
117 		_EXC_INFO_ARG);
118 #else
119   res = bid64qqq_fma (one, x, y
120 		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
121 		      _EXC_INFO_ARG);
122 #endif
123   BID_RETURN (res);
124 }
125 
126 
127 #if DECIMAL_CALL_BY_REFERENCE
128 void
129 bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
130 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
131 	      _EXC_INFO_PARAM) {
132   UINT64 x = *px, y = *py;
133 #if !DECIMAL_GLOBAL_ROUNDING
134   unsigned int rnd_mode = *prnd_mode;
135 #endif
136 #else
137 UINT128
138 bid128dd_add (UINT64 x, UINT64 y
139 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
140 	      _EXC_INFO_PARAM) {
141 #endif
142   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
143   };
144   UINT128 x1, y1;
145 
146 #if DECIMAL_CALL_BY_REFERENCE
147   bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
148   bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
149   bid128_add (&res, &x1, &y1
150 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
151 	      _EXC_INFO_ARG);
152 #else
153   x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
154   y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
155   res = bid128_add (x1, y1
156 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
157 		    _EXC_INFO_ARG);
158 #endif
159   BID_RETURN (res);
160 }
161 
162 
163 #if DECIMAL_CALL_BY_REFERENCE
164 void
165 bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
166 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
167 	      _EXC_INFO_PARAM) {
168   UINT64 x = *px;
169 #if !DECIMAL_GLOBAL_ROUNDING
170   unsigned int rnd_mode = *prnd_mode;
171 #endif
172 #else
173 UINT128
174 bid128dq_add (UINT64 x, UINT128 y
175 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
176 	      _EXC_INFO_PARAM) {
177 #endif
178   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
179   };
180   UINT128 x1;
181 
182 #if DECIMAL_CALL_BY_REFERENCE
183   bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
184   bid128_add (&res, &x1, py
185 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
186 	      _EXC_INFO_ARG);
187 #else
188   x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
189   res = bid128_add (x1, y
190 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
191 		    _EXC_INFO_ARG);
192 #endif
193   BID_RETURN (res);
194 }
195 
196 
197 #if DECIMAL_CALL_BY_REFERENCE
198 void
199 bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
200 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
201 	      _EXC_INFO_PARAM) {
202   UINT64 y = *py;
203 #if !DECIMAL_GLOBAL_ROUNDING
204   unsigned int rnd_mode = *prnd_mode;
205 #endif
206 #else
207 UINT128
208 bid128qd_add (UINT128 x, UINT64 y
209 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
210 	      _EXC_INFO_PARAM) {
211 #endif
212   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
213   };
214   UINT128 y1;
215 
216 #if DECIMAL_CALL_BY_REFERENCE
217   bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
218   bid128_add (&res, px, &y1
219 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
220 	      _EXC_INFO_ARG);
221 #else
222   y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
223   res = bid128_add (x, y1
224 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
225 		    _EXC_INFO_ARG);
226 #endif
227   BID_RETURN (res);
228 }
229 
230 
231 // bid128_add stands for bid128qq_add
232 
233 
234 /*****************************************************************************
235  *  BID64/BID128 sub
236  ****************************************************************************/
237 
238 #if DECIMAL_CALL_BY_REFERENCE
239 void
240 bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
241 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
242 	     _EXC_INFO_PARAM) {
243   UINT64 x = *px;
244 #if !DECIMAL_GLOBAL_ROUNDING
245   unsigned int rnd_mode = *prnd_mode;
246 #endif
247 #else
248 UINT64
249 bid64dq_sub (UINT64 x, UINT128 y
250 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
251 	     _EXC_INFO_PARAM) {
252 #endif
253   UINT64 res = 0xbaddbaddbaddbaddull;
254   UINT128 x1;
255 
256 #if DECIMAL_CALL_BY_REFERENCE
257   bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
258   bid64qq_sub (&res, &x1, py
259 	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
260 	       _EXC_INFO_ARG);
261 #else
262   x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
263   res = bid64qq_sub (x1, y
264 		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
265 		     _EXC_INFO_ARG);
266 #endif
267   BID_RETURN (res);
268 }
269 
270 
271 #if DECIMAL_CALL_BY_REFERENCE
272 void
273 bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
274 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
275 	     _EXC_INFO_PARAM) {
276   UINT64 y = *py;
277 #if !DECIMAL_GLOBAL_ROUNDING
278   unsigned int rnd_mode = *prnd_mode;
279 #endif
280 #else
281 UINT64
282 bid64qd_sub (UINT128 x, UINT64 y
283 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
284 	     _EXC_INFO_PARAM) {
285 #endif
286   UINT64 res = 0xbaddbaddbaddbaddull;
287   UINT128 y1;
288 
289 #if DECIMAL_CALL_BY_REFERENCE
290   bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
291   bid64qq_sub (&res, px, &y1
292 	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
293 	       _EXC_INFO_ARG);
294 #else
295   y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
296   res = bid64qq_sub (x, y1
297 		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
298 		     _EXC_INFO_ARG);
299 #endif
300   BID_RETURN (res);
301 }
302 
303 
304 #if DECIMAL_CALL_BY_REFERENCE
305 void
306 bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
307 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
308 	     _EXC_INFO_PARAM) {
309   UINT128 x = *px, y = *py;
310 #if !DECIMAL_GLOBAL_ROUNDING
311   unsigned int rnd_mode = *prnd_mode;
312 #endif
313 #else
314 UINT64
315 bid64qq_sub (UINT128 x, UINT128 y
316 	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
317 	     _EXC_INFO_PARAM) {
318 #endif
319 
320   UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
321   };
322   UINT64 res = 0xbaddbaddbaddbaddull;
323   UINT64 y_sign;
324 
325   BID_SWAP128 (one);
326   if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
327     // change its sign
328     y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
329     if (y_sign)
330       y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
331     else
332       y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
333   }
334 #if DECIMAL_CALL_BY_REFERENCE
335   bid64qqq_fma (&res, &one, &x, &y
336 		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
337 		_EXC_INFO_ARG);
338 #else
339   res = bid64qqq_fma (one, x, y
340 		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
341 		      _EXC_INFO_ARG);
342 #endif
343   BID_RETURN (res);
344 }
345 
346 
347 #if DECIMAL_CALL_BY_REFERENCE
348 void
349 bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
350 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
351 	      _EXC_INFO_PARAM) {
352   UINT64 x = *px, y = *py;
353 #if !DECIMAL_GLOBAL_ROUNDING
354   unsigned int rnd_mode = *prnd_mode;
355 #endif
356 #else
357 UINT128
358 bid128dd_sub (UINT64 x, UINT64 y
359 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
360 	      _EXC_INFO_PARAM) {
361 #endif
362   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
363   };
364   UINT128 x1, y1;
365 
366 #if DECIMAL_CALL_BY_REFERENCE
367   bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
368   bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
369   bid128_sub (&res, &x1, &y1
370 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
371 	      _EXC_INFO_ARG);
372 #else
373   x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
374   y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
375   res = bid128_sub (x1, y1
376 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
377 		    _EXC_INFO_ARG);
378 #endif
379   BID_RETURN (res);
380 }
381 
382 
383 #if DECIMAL_CALL_BY_REFERENCE
384 void
385 bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
386 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
387 	      _EXC_INFO_PARAM) {
388   UINT64 x = *px;
389 #if !DECIMAL_GLOBAL_ROUNDING
390   unsigned int rnd_mode = *prnd_mode;
391 #endif
392 #else
393 UINT128
394 bid128dq_sub (UINT64 x, UINT128 y
395 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
396 	      _EXC_INFO_PARAM) {
397 #endif
398   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
399   };
400   UINT128 x1;
401 
402 #if DECIMAL_CALL_BY_REFERENCE
403   bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
404   bid128_sub (&res, &x1, py
405 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
406 	      _EXC_INFO_ARG);
407 #else
408   x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
409   res = bid128_sub (x1, y
410 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
411 		    _EXC_INFO_ARG);
412 #endif
413   BID_RETURN (res);
414 }
415 
416 
417 #if DECIMAL_CALL_BY_REFERENCE
418 void
419 bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
420 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
421 	      _EXC_INFO_PARAM) {
422   UINT64 y = *py;
423 #if !DECIMAL_GLOBAL_ROUNDING
424   unsigned int rnd_mode = *prnd_mode;
425 #endif
426 #else
427 UINT128
428 bid128qd_sub (UINT128 x, UINT64 y
429 	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
430 	      _EXC_INFO_PARAM) {
431 #endif
432   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
433   };
434   UINT128 y1;
435 
436 #if DECIMAL_CALL_BY_REFERENCE
437   bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
438   bid128_sub (&res, px, &y1
439 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
440 	      _EXC_INFO_ARG);
441 #else
442   y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
443   res = bid128_sub (x, y1
444 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
445 		    _EXC_INFO_ARG);
446 #endif
447   BID_RETURN (res);
448 }
449 
450 #if DECIMAL_CALL_BY_REFERENCE
451 void
452 bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
453 	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
454 	    _EXC_INFO_PARAM) {
455   UINT128 x = *px, y = *py;
456 #if !DECIMAL_GLOBAL_ROUNDING
457   unsigned int rnd_mode = *prnd_mode;
458 #endif
459 #else
460 UINT128
461 bid128_add (UINT128 x, UINT128 y
462 	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
463 	    _EXC_INFO_PARAM) {
464 #endif
465   UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
466   };
467   UINT64 x_sign, y_sign, tmp_sign;
468   UINT64 x_exp, y_exp, tmp_exp;	// e1 = x_exp, e2 = y_exp
469   UINT64 C1_hi, C2_hi, tmp_signif_hi;
470   UINT64 C1_lo, C2_lo, tmp_signif_lo;
471   // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
472   // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
473   UINT64 tmp64, tmp64A, tmp64B;
474   BID_UI64DOUBLE tmp1, tmp2;
475   int x_nr_bits, y_nr_bits;
476   int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
477   UINT64 halfulp64;
478   UINT128 halfulp128;
479   UINT128 C1, C2;
480   UINT128 ten2m1;
481   UINT128 highf2star;		// top 128 bits in f2*; low 128 bits in R256[1], R256[0]
482   UINT256 P256, Q256, R256;
483   int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
484   int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
485   int second_pass = 0;
486 
487   BID_SWAP128 (x);
488   BID_SWAP128 (y);
489   x_sign = x.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
490   y_sign = y.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
491 
492   // check for NaN or Infinity
493   if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
494       || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
495     // x is special or y is special
496     if ((x.w[1] & MASK_NAN) == MASK_NAN) {	// x is NAN
497       // check first for non-canonical NaN payload
498       if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
499 	  (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
500 	   && (x.w[0] > 0x38c15b09ffffffffull))) {
501 	x.w[1] = x.w[1] & 0xffffc00000000000ull;
502 	x.w[0] = 0x0ull;
503       }
504       if ((x.w[1] & MASK_SNAN) == MASK_SNAN) {	// x is SNAN
505 	// set invalid flag
506 	*pfpsf |= INVALID_EXCEPTION;
507 	// return quiet (x)
508 	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
509 	// clear out also G[6]-G[16]
510 	res.w[0] = x.w[0];
511       } else {	// x is QNaN
512 	// return x
513 	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
514 	// clear out G[6]-G[16]
515 	res.w[0] = x.w[0];
516 	// if y = SNaN signal invalid exception
517 	if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
518 	  // set invalid flag
519 	  *pfpsf |= INVALID_EXCEPTION;
520 	}
521       }
522       BID_SWAP128 (res);
523       BID_RETURN (res);
524     } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {	// y is NAN
525       // check first for non-canonical NaN payload
526       if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
527 	  (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
528 	   && (y.w[0] > 0x38c15b09ffffffffull))) {
529 	y.w[1] = y.w[1] & 0xffffc00000000000ull;
530 	y.w[0] = 0x0ull;
531       }
532       if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {	// y is SNAN
533 	// set invalid flag
534 	*pfpsf |= INVALID_EXCEPTION;
535 	// return quiet (y)
536 	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
537 	// clear out also G[6]-G[16]
538 	res.w[0] = y.w[0];
539       } else {	// y is QNaN
540 	// return y
541 	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
542 	// clear out G[6]-G[16]
543 	res.w[0] = y.w[0];
544       }
545       BID_SWAP128 (res);
546       BID_RETURN (res);
547     } else {	// neither x not y is NaN; at least one is infinity
548       if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {	// x is infinity
549 	if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {	// y is infinity
550 	  // if same sign, return either of them
551 	  if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
552 	    res.w[1] = x_sign | MASK_INF;
553 	    res.w[0] = 0x0ull;
554 	  } else {	// x and y are infinities of opposite signs
555 	    // set invalid flag
556 	    *pfpsf |= INVALID_EXCEPTION;
557 	    // return QNaN Indefinite
558 	    res.w[1] = 0x7c00000000000000ull;
559 	    res.w[0] = 0x0000000000000000ull;
560 	  }
561 	} else {	// y is 0 or finite
562 	  // return x
563 	  res.w[1] = x_sign | MASK_INF;
564 	  res.w[0] = 0x0ull;
565 	}
566       } else {	// x is not NaN or infinity, so y must be infinity
567 	res.w[1] = y_sign | MASK_INF;
568 	res.w[0] = 0x0ull;
569       }
570       BID_SWAP128 (res);
571       BID_RETURN (res);
572     }
573   }
574   // unpack the arguments
575 
576   // unpack x
577   C1_hi = x.w[1] & MASK_COEFF;
578   C1_lo = x.w[0];
579   // test for non-canonical values:
580   // - values whose encoding begins with x00, x01, or x10 and whose
581   //   coefficient is larger than 10^34 -1, or
582   // - values whose encoding begins with x1100, x1101, x1110 (if NaNs
583   //   and infinitis were eliminated already this test is reduced to
584   //   checking for x10x)
585 
586   // x is not infinity; check for non-canonical values - treated as zero
587   if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
588     // G0_G1=11; non-canonical
589     x_exp = (x.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
590     C1_hi = 0;	// significand high
591     C1_lo = 0;	// significand low
592   } else {	// G0_G1 != 11
593     x_exp = x.w[1] & MASK_EXP;	// biased and shifted left 49 bits
594     if (C1_hi > 0x0001ed09bead87c0ull ||
595 	(C1_hi == 0x0001ed09bead87c0ull
596 	 && C1_lo > 0x378d8e63ffffffffull)) {
597       // x is non-canonical if coefficient is larger than 10^34 -1
598       C1_hi = 0;
599       C1_lo = 0;
600     } else {	// canonical
601       ;
602     }
603   }
604 
605   // unpack y
606   C2_hi = y.w[1] & MASK_COEFF;
607   C2_lo = y.w[0];
608   // y is not infinity; check for non-canonical values - treated as zero
609   if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
610     // G0_G1=11; non-canonical
611     y_exp = (y.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
612     C2_hi = 0;	// significand high
613     C2_lo = 0;	// significand low
614   } else {	// G0_G1 != 11
615     y_exp = y.w[1] & MASK_EXP;	// biased and shifted left 49 bits
616     if (C2_hi > 0x0001ed09bead87c0ull ||
617 	(C2_hi == 0x0001ed09bead87c0ull
618 	 && C2_lo > 0x378d8e63ffffffffull)) {
619       // y is non-canonical if coefficient is larger than 10^34 -1
620       C2_hi = 0;
621       C2_lo = 0;
622     } else {	// canonical
623       ;
624     }
625   }
626 
627   if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
628     // x is 0 and y is not special
629     // if y is 0 return 0 with the smaller exponent
630     if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
631       if (x_exp < y_exp)
632 	res.w[1] = x_exp;
633       else
634 	res.w[1] = y_exp;
635       if (x_sign && y_sign)
636 	res.w[1] = res.w[1] | x_sign;	// both negative
637       else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
638 	res.w[1] = res.w[1] | 0x8000000000000000ull;	// -0
639       // else; // res = +0
640       res.w[0] = 0;
641     } else {
642       // for 0 + y return y, with the preferred exponent
643       if (y_exp <= x_exp) {
644 	res.w[1] = y.w[1];
645 	res.w[0] = y.w[0];
646       } else {	// if y_exp > x_exp
647 	// return (C2 * 10^scale) * 10^(y_exp - scale)
648 	// where scale = min (P34-q2, y_exp-x_exp)
649 	// determine q2 = nr. of decimal digits in y
650 	//  determine first the nr. of bits in y (y_nr_bits)
651 
652 	if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
653 	  if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
654 	    // split the 64-bit value in two 32-bit halves to avoid
655 	    // rounding errors
656 	    if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
657 	      tmp2.d = (double) (C2_lo >> 32);	// exact conversion
658 	      y_nr_bits =
659 		32 +
660 		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
661 	    } else {	// y < 2^32
662 	      tmp2.d = (double) (C2_lo);	// exact conversion
663 	      y_nr_bits =
664 		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
665 	    }
666 	  } else {	// if y < 2^53
667 	    tmp2.d = (double) C2_lo;	// exact conversion
668 	    y_nr_bits =
669 	      ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
670 	  }
671 	} else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
672 	  tmp2.d = (double) C2_hi;	// exact conversion
673 	  y_nr_bits =
674 	    64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
675 	}
676 	q2 = nr_digits[y_nr_bits].digits;
677 	if (q2 == 0) {
678 	  q2 = nr_digits[y_nr_bits].digits1;
679 	  if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
680 	      (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
681 	       C2_lo >= nr_digits[y_nr_bits].threshold_lo))
682 	    q2++;
683 	}
684 	// return (C2 * 10^scale) * 10^(y_exp - scale)
685 	// where scale = min (P34-q2, y_exp-x_exp)
686 	scale = P34 - q2;
687 	ind = (y_exp - x_exp) >> 49;
688 	if (ind < scale)
689 	  scale = ind;
690 	if (scale == 0) {
691 	  res.w[1] = y.w[1];
692 	  res.w[0] = y.w[0];
693 	} else if (q2 <= 19) {	// y fits in 64 bits
694 	  if (scale <= 19) {	// 10^scale fits in 64 bits
695 	    // 64 x 64 C2_lo * ten2k64[scale]
696 	    __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
697 	  } else {	// 10^scale fits in 128 bits
698 	    // 64 x 128 C2_lo * ten2k128[scale - 20]
699 	    __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
700 	  }
701 	} else {	// y fits in 128 bits, but 10^scale must fit in 64 bits
702 	  // 64 x 128 ten2k64[scale] * C2
703 	  C2.w[1] = C2_hi;
704 	  C2.w[0] = C2_lo;
705 	  __mul_128x64_to_128 (res, ten2k64[scale], C2);
706 	}
707 	// subtract scale from the exponent
708 	y_exp = y_exp - ((UINT64) scale << 49);
709 	res.w[1] = res.w[1] | y_sign | y_exp;
710       }
711     }
712     BID_SWAP128 (res);
713     BID_RETURN (res);
714   } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
715     // y is 0 and x is not special, and not zero
716     // for x + 0 return x, with the preferred exponent
717     if (x_exp <= y_exp) {
718       res.w[1] = x.w[1];
719       res.w[0] = x.w[0];
720     } else {	// if x_exp > y_exp
721       // return (C1 * 10^scale) * 10^(x_exp - scale)
722       // where scale = min (P34-q1, x_exp-y_exp)
723       // determine q1 = nr. of decimal digits in x
724       //  determine first the nr. of bits in x
725       if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
726 	if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
727 	  // split the 64-bit value in two 32-bit halves to avoid
728 	  // rounding errors
729 	  if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
730 	    tmp1.d = (double) (C1_lo >> 32);	// exact conversion
731 	    x_nr_bits =
732 	      32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
733 		    0x3ff);
734 	  } else {	// x < 2^32
735 	    tmp1.d = (double) (C1_lo);	// exact conversion
736 	    x_nr_bits =
737 	      ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
738 	  }
739 	} else {	// if x < 2^53
740 	  tmp1.d = (double) C1_lo;	// exact conversion
741 	  x_nr_bits =
742 	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
743 	}
744       } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
745 	tmp1.d = (double) C1_hi;	// exact conversion
746 	x_nr_bits =
747 	  64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
748       }
749       q1 = nr_digits[x_nr_bits].digits;
750       if (q1 == 0) {
751 	q1 = nr_digits[x_nr_bits].digits1;
752 	if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
753 	    (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
754 	     C1_lo >= nr_digits[x_nr_bits].threshold_lo))
755 	  q1++;
756       }
757       // return (C1 * 10^scale) * 10^(x_exp - scale)
758       // where scale = min (P34-q1, x_exp-y_exp)
759       scale = P34 - q1;
760       ind = (x_exp - y_exp) >> 49;
761       if (ind < scale)
762 	scale = ind;
763       if (scale == 0) {
764 	res.w[1] = x.w[1];
765 	res.w[0] = x.w[0];
766       } else if (q1 <= 19) {	// x fits in 64 bits
767 	if (scale <= 19) {	// 10^scale fits in 64 bits
768 	  // 64 x 64 C1_lo * ten2k64[scale]
769 	  __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
770 	} else {	// 10^scale fits in 128 bits
771 	  // 64 x 128 C1_lo * ten2k128[scale - 20]
772 	  __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
773 	}
774       } else {	// x fits in 128 bits, but 10^scale must fit in 64 bits
775 	// 64 x 128 ten2k64[scale] * C1
776 	C1.w[1] = C1_hi;
777 	C1.w[0] = C1_lo;
778 	__mul_128x64_to_128 (res, ten2k64[scale], C1);
779       }
780       // subtract scale from the exponent
781       x_exp = x_exp - ((UINT64) scale << 49);
782       res.w[1] = res.w[1] | x_sign | x_exp;
783     }
784     BID_SWAP128 (res);
785     BID_RETURN (res);
786   } else {	// x and y are not canonical, not special, and are not zero
787     // note that the result may still be zero, and then it has to have the
788     // preferred exponent
789     if (x_exp < y_exp) {	// if exp_x < exp_y then swap x and y
790       tmp_sign = x_sign;
791       tmp_exp = x_exp;
792       tmp_signif_hi = C1_hi;
793       tmp_signif_lo = C1_lo;
794       x_sign = y_sign;
795       x_exp = y_exp;
796       C1_hi = C2_hi;
797       C1_lo = C2_lo;
798       y_sign = tmp_sign;
799       y_exp = tmp_exp;
800       C2_hi = tmp_signif_hi;
801       C2_lo = tmp_signif_lo;
802     }
803     // q1 = nr. of decimal digits in x
804     //  determine first the nr. of bits in x
805     if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
806       if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
807 	//split the 64-bit value in two 32-bit halves to avoid rounding errors
808 	if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
809 	  tmp1.d = (double) (C1_lo >> 32);	// exact conversion
810 	  x_nr_bits =
811 	    32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
812 	} else {	// x < 2^32
813 	  tmp1.d = (double) (C1_lo);	// exact conversion
814 	  x_nr_bits =
815 	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
816 	}
817       } else {	// if x < 2^53
818 	tmp1.d = (double) C1_lo;	// exact conversion
819 	x_nr_bits =
820 	  ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
821       }
822     } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
823       tmp1.d = (double) C1_hi;	// exact conversion
824       x_nr_bits =
825 	64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
826     }
827 
828     q1 = nr_digits[x_nr_bits].digits;
829     if (q1 == 0) {
830       q1 = nr_digits[x_nr_bits].digits1;
831       if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
832 	  (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
833 	   C1_lo >= nr_digits[x_nr_bits].threshold_lo))
834 	q1++;
835     }
836     // q2 = nr. of decimal digits in y
837     //  determine first the nr. of bits in y (y_nr_bits)
838     if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
839       if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
840 	//split the 64-bit value in two 32-bit halves to avoid rounding errors
841 	if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
842 	  tmp2.d = (double) (C2_lo >> 32);	// exact conversion
843 	  y_nr_bits =
844 	    32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
845 	} else {	// y < 2^32
846 	  tmp2.d = (double) (C2_lo);	// exact conversion
847 	  y_nr_bits =
848 	    ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
849 	}
850       } else {	// if y < 2^53
851 	tmp2.d = (double) C2_lo;	// exact conversion
852 	y_nr_bits =
853 	  ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
854       }
855     } else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
856       tmp2.d = (double) C2_hi;	// exact conversion
857       y_nr_bits =
858 	64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
859     }
860 
861     q2 = nr_digits[y_nr_bits].digits;
862     if (q2 == 0) {
863       q2 = nr_digits[y_nr_bits].digits1;
864       if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
865 	  (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
866 	   C2_lo >= nr_digits[y_nr_bits].threshold_lo))
867 	q2++;
868     }
869 
870     delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
871 
872     if (delta >= P34) {
873       // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
874       // n = C1 * 10^e1 or n = C1 +/- 10^(q1-P34)) * 10^e1
875       // the result is inexact; the preferred exponent is the least possible
876 
877       if (delta >= P34 + 1) {
878 	// for RN the result is the operand with the larger magnitude,
879 	// possibly scaled up by 10^(P34-q1)
880 	// an overflow cannot occur in this case (rounding to nearest)
881 	if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
882 	  // Note: because delta >= P34+1 it is certain that
883 	  //     x_exp - ((UINT64)scale << 49) will stay above e_min
884 	  scale = P34 - q1;
885 	  if (q1 <= 19) {	// C1 fits in 64 bits
886 	    // 1 <= q1 <= 19 => 15 <= scale <= 33
887 	    if (scale <= 19) {	// 10^scale fits in 64 bits
888 	      __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
889 	    } else {	// if 20 <= scale <= 33
890 	      // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
891 	      // (C1 * 10^(scale-19)) fits in 64 bits
892 	      C1_lo = C1_lo * ten2k64[scale - 19];
893 	      __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
894 	    }
895 	  } else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
896 	    // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
897 	    C1.w[1] = C1_hi;
898 	    C1.w[0] = C1_lo;
899 	    // C1 = ten2k64[P34 - q1] * C1
900 	    __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
901 	  }
902 	  x_exp = x_exp - ((UINT64) scale << 49);
903 	  C1_hi = C1.w[1];
904 	  C1_lo = C1.w[0];
905 	}
906 	// some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1)
907 	// (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) =>
908 	// subtract 1 ulp
909 	// Note: do this only for rounding to nearest; for other rounding
910 	// modes the correction will be applied next
911 	if ((rnd_mode == ROUNDING_TO_NEAREST
912 	     || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
913 	    && C1_hi == 0x0000314dc6448d93ull
914 	    && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
915 	    && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
916 							     && (C2_hi >
917 								 midpoint128
918 								 [q2 -
919 								  20].
920 								 w[1]
921 								 ||
922 								 (C2_hi
923 								  ==
924 								  midpoint128
925 								  [q2 -
926 								   20].
927 								  w[1]
928 								  &&
929 								  C2_lo
930 								  >
931 								  midpoint128
932 								  [q2 -
933 								   20].
934 								  w
935 								  [0])))))
936 	{
937 	  // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
938 	  C1_hi = 0x0001ed09bead87c0ull;
939 	  C1_lo = 0x378d8e63ffffffffull;
940 	  x_exp = x_exp - EXP_P1;
941 	}
942 	if (rnd_mode != ROUNDING_TO_NEAREST) {
943 	  if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
944 	      (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
945 	    // add 1 ulp and then check for overflow
946 	    C1_lo = C1_lo + 1;
947 	    if (C1_lo == 0) {	// rounding overflow in the low 64 bits
948 	      C1_hi = C1_hi + 1;
949 	    }
950 	    if (C1_hi == 0x0001ed09bead87c0ull
951 		&& C1_lo == 0x378d8e6400000000ull) {
952 	      // C1 = 10^34 => rounding overflow
953 	      C1_hi = 0x0000314dc6448d93ull;
954 	      C1_lo = 0x38c15b0a00000000ull;	// 10^33
955 	      x_exp = x_exp + EXP_P1;
956 	      if (x_exp == EXP_MAX_P1) {	// overflow
957 		C1_hi = 0x7800000000000000ull;	// +inf
958 		C1_lo = 0x0ull;
959 		x_exp = 0;	// x_sign is preserved
960 		// set overflow flag (the inexact flag was set too)
961 		*pfpsf |= OVERFLOW_EXCEPTION;
962 	      }
963 	    }
964 	  } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
965 		     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
966 		     (rnd_mode == ROUNDING_TO_ZERO
967 		      && x_sign != y_sign)) {
968 	    // subtract 1 ulp from C1
969 	    // Note: because delta >= P34 + 1 the result cannot be zero
970 	    C1_lo = C1_lo - 1;
971 	    if (C1_lo == 0xffffffffffffffffull)
972 	      C1_hi = C1_hi - 1;
973 	    // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and
974 	    // decrease the exponent by 1 (because delta >= P34 + 1 the
975 	    // exponent will not become less than e_min)
976 	    // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
977 	    // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
978 	    if (C1_hi == 0x0000314dc6448d93ull
979 		&& C1_lo == 0x38c15b09ffffffffull) {
980 	      // make C1 = 10^34  - 1
981 	      C1_hi = 0x0001ed09bead87c0ull;
982 	      C1_lo = 0x378d8e63ffffffffull;
983 	      x_exp = x_exp - EXP_P1;
984 	    }
985 	  } else {
986 	    ;	// the result is already correct
987 	  }
988 	}
989 	// set the inexact flag
990 	*pfpsf |= INEXACT_EXCEPTION;
991 	// assemble the result
992 	res.w[1] = x_sign | x_exp | C1_hi;
993 	res.w[0] = C1_lo;
994       } else {	// delta = P34
995 	// in most cases, the smaller operand may be < or = or > 1/2 ulp of the
996 	// larger operand
997 	// however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
998 	// to accuracy loss after subtraction, and will be treated separately
999 	if (x_sign == y_sign || (q1 <= 20
1000 				 && (C1_hi != 0
1001 				     || C1_lo != ten2k64[q1 - 1]))
1002 	    || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
1003 			     || C1_lo != ten2k128[q1 - 21].w[0]))) {
1004 	  // if x_sign == y_sign or C1 != 10^(q1-1)
1005 	  // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
1006 	  // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
1007 	  if (q2 <= 19) {	// C2 and 5*10^(q2-1) both fit in 64 bits
1008 	    halfulp64 = midpoint64[q2 - 1];	// 5 * 10^(q2-1)
1009 	    if (C2_lo < halfulp64) {	// n2 < 1/2 ulp (n1)
1010 	      // for RN the result is the operand with the larger magnitude,
1011 	      // possibly scaled up by 10^(P34-q1)
1012 	      // an overflow cannot occur in this case (rounding to nearest)
1013 	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1014 		// Note: because delta = P34 it is certain that
1015 		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1016 		scale = P34 - q1;
1017 		if (q1 <= 19) {	// C1 fits in 64 bits
1018 		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1019 		  if (scale <= 19) {	// 10^scale fits in 64 bits
1020 		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1021 		  } else {	// if 20 <= scale <= 33
1022 		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1023 		    // (C1 * 10^(scale-19)) fits in 64 bits
1024 		    C1_lo = C1_lo * ten2k64[scale - 19];
1025 		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1026 		  }
1027 		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1028 		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1029 		  C1.w[1] = C1_hi;
1030 		  C1.w[0] = C1_lo;
1031 		  // C1 = ten2k64[P34 - q1] * C1
1032 		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1033 		}
1034 		x_exp = x_exp - ((UINT64) scale << 49);
1035 		C1_hi = C1.w[1];
1036 		C1_lo = C1.w[0];
1037 	      }
1038 	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1039 		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1040 		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1041 		  // add 1 ulp and then check for overflow
1042 		  C1_lo = C1_lo + 1;
1043 		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1044 		    C1_hi = C1_hi + 1;
1045 		  }
1046 		  if (C1_hi == 0x0001ed09bead87c0ull
1047 		      && C1_lo == 0x378d8e6400000000ull) {
1048 		    // C1 = 10^34 => rounding overflow
1049 		    C1_hi = 0x0000314dc6448d93ull;
1050 		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1051 		    x_exp = x_exp + EXP_P1;
1052 		    if (x_exp == EXP_MAX_P1) {	// overflow
1053 		      C1_hi = 0x7800000000000000ull;	// +inf
1054 		      C1_lo = 0x0ull;
1055 		      x_exp = 0;	// x_sign is preserved
1056 		      // set overflow flag (the inexact flag was set too)
1057 		      *pfpsf |= OVERFLOW_EXCEPTION;
1058 		    }
1059 		  }
1060 		} else
1061 		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1062 		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1063 		      || (rnd_mode == ROUNDING_TO_ZERO
1064 			  && x_sign != y_sign)) {
1065 		  // subtract 1 ulp from C1
1066 		  // Note: because delta >= P34 + 1 the result cannot be zero
1067 		  C1_lo = C1_lo - 1;
1068 		  if (C1_lo == 0xffffffffffffffffull)
1069 		    C1_hi = C1_hi - 1;
1070 		  // if the coefficient is 10^33-1 then make it 10^34-1 and
1071 		  // decrease the exponent by 1 (because delta >= P34 + 1 the
1072 		  // exponent will not become less than e_min)
1073 		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1074 		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1075 		  if (C1_hi == 0x0000314dc6448d93ull
1076 		      && C1_lo == 0x38c15b09ffffffffull) {
1077 		    // make C1 = 10^34  - 1
1078 		    C1_hi = 0x0001ed09bead87c0ull;
1079 		    C1_lo = 0x378d8e63ffffffffull;
1080 		    x_exp = x_exp - EXP_P1;
1081 		  }
1082 		} else {
1083 		  ;	// the result is already correct
1084 		}
1085 	      }
1086 	      // set the inexact flag
1087 	      *pfpsf |= INEXACT_EXCEPTION;
1088 	      // assemble the result
1089 	      res.w[1] = x_sign | x_exp | C1_hi;
1090 	      res.w[0] = C1_lo;
1091 	    } else if ((C2_lo == halfulp64)
1092 		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1093 	      // n2 = 1/2 ulp (n1) and C1 is even
1094 	      // the result is the operand with the larger magnitude,
1095 	      // possibly scaled up by 10^(P34-q1)
1096 	      // an overflow cannot occur in this case (rounding to nearest)
1097 	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1098 		// Note: because delta = P34 it is certain that
1099 		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1100 		scale = P34 - q1;
1101 		if (q1 <= 19) {	// C1 fits in 64 bits
1102 		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1103 		  if (scale <= 19) {	// 10^scale fits in 64 bits
1104 		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1105 		  } else {	// if 20 <= scale <= 33
1106 		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1107 		    // (C1 * 10^(scale-19)) fits in 64 bits
1108 		    C1_lo = C1_lo * ten2k64[scale - 19];
1109 		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1110 		  }
1111 		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1112 		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1113 		  C1.w[1] = C1_hi;
1114 		  C1.w[0] = C1_lo;
1115 		  // C1 = ten2k64[P34 - q1] * C1
1116 		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1117 		}
1118 		x_exp = x_exp - ((UINT64) scale << 49);
1119 		C1_hi = C1.w[1];
1120 		C1_lo = C1.w[0];
1121 	      }
1122 	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
1123 		   && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
1124 					  && x_sign == y_sign)
1125 		  || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
1126 		  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
1127 		// add 1 ulp and then check for overflow
1128 		C1_lo = C1_lo + 1;
1129 		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1130 		  C1_hi = C1_hi + 1;
1131 		}
1132 		if (C1_hi == 0x0001ed09bead87c0ull
1133 		    && C1_lo == 0x378d8e6400000000ull) {
1134 		  // C1 = 10^34 => rounding overflow
1135 		  C1_hi = 0x0000314dc6448d93ull;
1136 		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1137 		  x_exp = x_exp + EXP_P1;
1138 		  if (x_exp == EXP_MAX_P1) {	// overflow
1139 		    C1_hi = 0x7800000000000000ull;	// +inf
1140 		    C1_lo = 0x0ull;
1141 		    x_exp = 0;	// x_sign is preserved
1142 		    // set overflow flag (the inexact flag was set too)
1143 		    *pfpsf |= OVERFLOW_EXCEPTION;
1144 		  }
1145 		}
1146 	      } else
1147 		if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
1148 		     && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
1149 					    && !x_sign && y_sign)
1150 		    || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1151 		    || (rnd_mode == ROUNDING_TO_ZERO
1152 			&& x_sign != y_sign)) {
1153 		// subtract 1 ulp from C1
1154 		// Note: because delta >= P34 + 1 the result cannot be zero
1155 		C1_lo = C1_lo - 1;
1156 		if (C1_lo == 0xffffffffffffffffull)
1157 		  C1_hi = C1_hi - 1;
1158 		// if the coefficient is 10^33 - 1 then make it 10^34 - 1
1159 		// and decrease the exponent by 1 (because delta >= P34 + 1
1160 		// the exponent will not become less than e_min)
1161 		// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1162 		// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1163 		if (C1_hi == 0x0000314dc6448d93ull
1164 		    && C1_lo == 0x38c15b09ffffffffull) {
1165 		  // make C1 = 10^34  - 1
1166 		  C1_hi = 0x0001ed09bead87c0ull;
1167 		  C1_lo = 0x378d8e63ffffffffull;
1168 		  x_exp = x_exp - EXP_P1;
1169 		}
1170 	      } else {
1171 		;	// the result is already correct
1172 	      }
1173 	      // set the inexact flag
1174 	      *pfpsf |= INEXACT_EXCEPTION;
1175 	      // assemble the result
1176 	      res.w[1] = x_sign | x_exp | C1_hi;
1177 	      res.w[0] = C1_lo;
1178 	    } else {	// if C2_lo > halfulp64 ||
1179 	      // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
1180 	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1181 	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1182 	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
1183 		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1184 		// because q1 < P34 we must first replace C1 by
1185 		// C1 * 10^(P34-q1), and must decrease the exponent by
1186 		// (P34-q1) (it will still be at least e_min)
1187 		scale = P34 - q1;
1188 		if (q1 <= 19) {	// C1 fits in 64 bits
1189 		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1190 		  if (scale <= 19) {	// 10^scale fits in 64 bits
1191 		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1192 		  } else {	// if 20 <= scale <= 33
1193 		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1194 		    // (C1 * 10^(scale-19)) fits in 64 bits
1195 		    C1_lo = C1_lo * ten2k64[scale - 19];
1196 		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1197 		  }
1198 		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1199 		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1200 		  C1.w[1] = C1_hi;
1201 		  C1.w[0] = C1_lo;
1202 		  // C1 = ten2k64[P34 - q1] * C1
1203 		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1204 		}
1205 		x_exp = x_exp - ((UINT64) scale << 49);
1206 		C1_hi = C1.w[1];
1207 		C1_lo = C1.w[0];
1208 		// check for rounding overflow
1209 		if (C1_hi == 0x0001ed09bead87c0ull
1210 		    && C1_lo == 0x378d8e6400000000ull) {
1211 		  // C1 = 10^34 => rounding overflow
1212 		  C1_hi = 0x0000314dc6448d93ull;
1213 		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1214 		  x_exp = x_exp + EXP_P1;
1215 		}
1216 	      }
1217 	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1218 		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1219 		      && C2_lo != halfulp64)
1220 		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1221 		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1222 		  || (rnd_mode == ROUNDING_TO_ZERO
1223 		      && x_sign != y_sign)) {
1224 		// the result is x - 1
1225 		// for RN n1 * n2 < 0; underflow not possible
1226 		C1_lo = C1_lo - 1;
1227 		if (C1_lo == 0xffffffffffffffffull)
1228 		  C1_hi--;
1229 		// check if we crossed into the lower decade
1230 		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1231 		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1232 		  C1_lo = 0x378d8e63ffffffffull;
1233 		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
1234 		}
1235 	      } else
1236 		if ((rnd_mode == ROUNDING_TO_NEAREST
1237 		     && x_sign == y_sign)
1238 		    || (rnd_mode == ROUNDING_TIES_AWAY
1239 			&& x_sign == y_sign)
1240 		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1241 		    || (rnd_mode == ROUNDING_UP && !x_sign
1242 			&& !y_sign)) {
1243 		// the result is x + 1
1244 		// for RN x_sign = y_sign, i.e. n1*n2 > 0
1245 		C1_lo = C1_lo + 1;
1246 		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1247 		  C1_hi = C1_hi + 1;
1248 		}
1249 		if (C1_hi == 0x0001ed09bead87c0ull
1250 		    && C1_lo == 0x378d8e6400000000ull) {
1251 		  // C1 = 10^34 => rounding overflow
1252 		  C1_hi = 0x0000314dc6448d93ull;
1253 		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1254 		  x_exp = x_exp + EXP_P1;
1255 		  if (x_exp == EXP_MAX_P1) {	// overflow
1256 		    C1_hi = 0x7800000000000000ull;	// +inf
1257 		    C1_lo = 0x0ull;
1258 		    x_exp = 0;	// x_sign is preserved
1259 		    // set the overflow flag
1260 		    *pfpsf |= OVERFLOW_EXCEPTION;
1261 		  }
1262 		}
1263 	      } else {
1264 		;	// the result is x
1265 	      }
1266 	      // set the inexact flag
1267 	      *pfpsf |= INEXACT_EXCEPTION;
1268 	      // assemble the result
1269 	      res.w[1] = x_sign | x_exp | C1_hi;
1270 	      res.w[0] = C1_lo;
1271 	    }
1272 	  } else {	// if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in
1273 	    // most cases) fit only in more than 64 bits
1274 	    halfulp128 = midpoint128[q2 - 20];	// 5 * 10^(q2-1)
1275 	    if ((C2_hi < halfulp128.w[1])
1276 		|| (C2_hi == halfulp128.w[1]
1277 		    && C2_lo < halfulp128.w[0])) {
1278 	      // n2 < 1/2 ulp (n1)
1279 	      // the result is the operand with the larger magnitude,
1280 	      // possibly scaled up by 10^(P34-q1)
1281 	      // an overflow cannot occur in this case (rounding to nearest)
1282 	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1283 		// Note: because delta = P34 it is certain that
1284 		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1285 		scale = P34 - q1;
1286 		if (q1 <= 19) {	// C1 fits in 64 bits
1287 		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1288 		  if (scale <= 19) {	// 10^scale fits in 64 bits
1289 		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1290 		  } else {	// if 20 <= scale <= 33
1291 		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1292 		    // (C1 * 10^(scale-19)) fits in 64 bits
1293 		    C1_lo = C1_lo * ten2k64[scale - 19];
1294 		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1295 		  }
1296 		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1297 		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1298 		  C1.w[1] = C1_hi;
1299 		  C1.w[0] = C1_lo;
1300 		  // C1 = ten2k64[P34 - q1] * C1
1301 		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1302 		}
1303 		C1_hi = C1.w[1];
1304 		C1_lo = C1.w[0];
1305 		x_exp = x_exp - ((UINT64) scale << 49);
1306 	      }
1307 	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1308 		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1309 		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1310 		  // add 1 ulp and then check for overflow
1311 		  C1_lo = C1_lo + 1;
1312 		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1313 		    C1_hi = C1_hi + 1;
1314 		  }
1315 		  if (C1_hi == 0x0001ed09bead87c0ull
1316 		      && C1_lo == 0x378d8e6400000000ull) {
1317 		    // C1 = 10^34 => rounding overflow
1318 		    C1_hi = 0x0000314dc6448d93ull;
1319 		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1320 		    x_exp = x_exp + EXP_P1;
1321 		    if (x_exp == EXP_MAX_P1) {	// overflow
1322 		      C1_hi = 0x7800000000000000ull;	// +inf
1323 		      C1_lo = 0x0ull;
1324 		      x_exp = 0;	// x_sign is preserved
1325 		      // set overflow flag (the inexact flag was set too)
1326 		      *pfpsf |= OVERFLOW_EXCEPTION;
1327 		    }
1328 		  }
1329 		} else
1330 		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1331 		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1332 		      || (rnd_mode == ROUNDING_TO_ZERO
1333 			  && x_sign != y_sign)) {
1334 		  // subtract 1 ulp from C1
1335 		  // Note: because delta >= P34 + 1 the result cannot be zero
1336 		  C1_lo = C1_lo - 1;
1337 		  if (C1_lo == 0xffffffffffffffffull)
1338 		    C1_hi = C1_hi - 1;
1339 		  // if the coefficient is 10^33-1 then make it 10^34-1 and
1340 		  // decrease the exponent by 1 (because delta >= P34 + 1 the
1341 		  // exponent will not become less than e_min)
1342 		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1343 		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1344 		  if (C1_hi == 0x0000314dc6448d93ull
1345 		      && C1_lo == 0x38c15b09ffffffffull) {
1346 		    // make C1 = 10^34  - 1
1347 		    C1_hi = 0x0001ed09bead87c0ull;
1348 		    C1_lo = 0x378d8e63ffffffffull;
1349 		    x_exp = x_exp - EXP_P1;
1350 		  }
1351 		} else {
1352 		  ;	// the result is already correct
1353 		}
1354 	      }
1355 	      // set the inexact flag
1356 	      *pfpsf |= INEXACT_EXCEPTION;
1357 	      // assemble the result
1358 	      res.w[1] = x_sign | x_exp | C1_hi;
1359 	      res.w[0] = C1_lo;
1360 	    } else if ((C2_hi == halfulp128.w[1]
1361 			&& C2_lo == halfulp128.w[0])
1362 		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1363 	      // midpoint & lsb in C1 is 0
1364 	      // n2 = 1/2 ulp (n1) and C1 is even
1365 	      // the result is the operand with the larger magnitude,
1366 	      // possibly scaled up by 10^(P34-q1)
1367 	      // an overflow cannot occur in this case (rounding to nearest)
1368 	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1369 		// Note: because delta = P34 it is certain that
1370 		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1371 		scale = P34 - q1;
1372 		if (q1 <= 19) {	// C1 fits in 64 bits
1373 		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1374 		  if (scale <= 19) {	// 10^scale fits in 64 bits
1375 		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1376 		  } else {	// if 20 <= scale <= 33
1377 		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1378 		    // (C1 * 10^(scale-19)) fits in 64 bits
1379 		    C1_lo = C1_lo * ten2k64[scale - 19];
1380 		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1381 		  }
1382 		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1383 		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1384 		  C1.w[1] = C1_hi;
1385 		  C1.w[0] = C1_lo;
1386 		  // C1 = ten2k64[P34 - q1] * C1
1387 		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1388 		}
1389 		x_exp = x_exp - ((UINT64) scale << 49);
1390 		C1_hi = C1.w[1];
1391 		C1_lo = C1.w[0];
1392 	      }
1393 	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1394 		if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
1395 		    || (rnd_mode == ROUNDING_UP && !y_sign)) {
1396 		  // add 1 ulp and then check for overflow
1397 		  C1_lo = C1_lo + 1;
1398 		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1399 		    C1_hi = C1_hi + 1;
1400 		  }
1401 		  if (C1_hi == 0x0001ed09bead87c0ull
1402 		      && C1_lo == 0x378d8e6400000000ull) {
1403 		    // C1 = 10^34 => rounding overflow
1404 		    C1_hi = 0x0000314dc6448d93ull;
1405 		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1406 		    x_exp = x_exp + EXP_P1;
1407 		    if (x_exp == EXP_MAX_P1) {	// overflow
1408 		      C1_hi = 0x7800000000000000ull;	// +inf
1409 		      C1_lo = 0x0ull;
1410 		      x_exp = 0;	// x_sign is preserved
1411 		      // set overflow flag (the inexact flag was set too)
1412 		      *pfpsf |= OVERFLOW_EXCEPTION;
1413 		    }
1414 		  }
1415 		} else if ((rnd_mode == ROUNDING_DOWN && y_sign)
1416 			   || (rnd_mode == ROUNDING_TO_ZERO
1417 			       && x_sign != y_sign)) {
1418 		  // subtract 1 ulp from C1
1419 		  // Note: because delta >= P34 + 1 the result cannot be zero
1420 		  C1_lo = C1_lo - 1;
1421 		  if (C1_lo == 0xffffffffffffffffull)
1422 		    C1_hi = C1_hi - 1;
1423 		  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
1424 		  // and decrease the exponent by 1 (because delta >= P34 + 1
1425 		  // the exponent will not become less than e_min)
1426 		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1427 		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1428 		  if (C1_hi == 0x0000314dc6448d93ull
1429 		      && C1_lo == 0x38c15b09ffffffffull) {
1430 		    // make C1 = 10^34  - 1
1431 		    C1_hi = 0x0001ed09bead87c0ull;
1432 		    C1_lo = 0x378d8e63ffffffffull;
1433 		    x_exp = x_exp - EXP_P1;
1434 		  }
1435 		} else {
1436 		  ;	// the result is already correct
1437 		}
1438 	      }
1439 	      // set the inexact flag
1440 	      *pfpsf |= INEXACT_EXCEPTION;
1441 	      // assemble the result
1442 	      res.w[1] = x_sign | x_exp | C1_hi;
1443 	      res.w[0] = C1_lo;
1444 	    } else {	// if C2 > halfulp128 ||
1445 	      // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
1446 	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1447 	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1448 	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
1449 		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1450 		// because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
1451 		// and must decrease the exponent by (P34-q1) (it will still be
1452 		// at least e_min)
1453 		scale = P34 - q1;
1454 		if (q1 <= 19) {	// C1 fits in 64 bits
1455 		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1456 		  if (scale <= 19) {	// 10^scale fits in 64 bits
1457 		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1458 		  } else {	// if 20 <= scale <= 33
1459 		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1460 		    // (C1 * 10^(scale-19)) fits in 64 bits
1461 		    C1_lo = C1_lo * ten2k64[scale - 19];
1462 		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1463 		  }
1464 		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1465 		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1466 		  C1.w[1] = C1_hi;
1467 		  C1.w[0] = C1_lo;
1468 		  // C1 = ten2k64[P34 - q1] * C1
1469 		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1470 		}
1471 		C1_hi = C1.w[1];
1472 		C1_lo = C1.w[0];
1473 		x_exp = x_exp - ((UINT64) scale << 49);
1474 	      }
1475 	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1476 		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1477 		      && (C2_hi != halfulp128.w[1]
1478 			  || C2_lo != halfulp128.w[0]))
1479 		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1480 		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1481 		  || (rnd_mode == ROUNDING_TO_ZERO
1482 		      && x_sign != y_sign)) {
1483 		// the result is x - 1
1484 		// for RN n1 * n2 < 0; underflow not possible
1485 		C1_lo = C1_lo - 1;
1486 		if (C1_lo == 0xffffffffffffffffull)
1487 		  C1_hi--;
1488 		// check if we crossed into the lower decade
1489 		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1490 		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1491 		  C1_lo = 0x378d8e63ffffffffull;
1492 		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
1493 		}
1494 	      } else
1495 		if ((rnd_mode == ROUNDING_TO_NEAREST
1496 		     && x_sign == y_sign)
1497 		    || (rnd_mode == ROUNDING_TIES_AWAY
1498 			&& x_sign == y_sign)
1499 		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1500 		    || (rnd_mode == ROUNDING_UP && !x_sign
1501 			&& !y_sign)) {
1502 		// the result is x + 1
1503 		// for RN x_sign = y_sign, i.e. n1*n2 > 0
1504 		C1_lo = C1_lo + 1;
1505 		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1506 		  C1_hi = C1_hi + 1;
1507 		}
1508 		if (C1_hi == 0x0001ed09bead87c0ull
1509 		    && C1_lo == 0x378d8e6400000000ull) {
1510 		  // C1 = 10^34 => rounding overflow
1511 		  C1_hi = 0x0000314dc6448d93ull;
1512 		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1513 		  x_exp = x_exp + EXP_P1;
1514 		  if (x_exp == EXP_MAX_P1) {	// overflow
1515 		    C1_hi = 0x7800000000000000ull;	// +inf
1516 		    C1_lo = 0x0ull;
1517 		    x_exp = 0;	// x_sign is preserved
1518 		    // set the overflow flag
1519 		    *pfpsf |= OVERFLOW_EXCEPTION;
1520 		  }
1521 		}
1522 	      } else {
1523 		;	// the result is x
1524 	      }
1525 	      // set the inexact flag
1526 	      *pfpsf |= INEXACT_EXCEPTION;
1527 	      // assemble the result
1528 	      res.w[1] = x_sign | x_exp | C1_hi;
1529 	      res.w[0] = C1_lo;
1530 	    }
1531 	  }	// end q1 >= 20
1532 	  // end case where C1 != 10^(q1-1)
1533 	} else {	// C1 = 10^(q1-1) and x_sign != y_sign
1534 	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
1535 	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
1536 	  // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
1537 	  // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34
1538 	  // digits and n = C' * 10^(e2+x1)
1539 	  // If the result has P34+1 digits, redo the steps above with x1+1
1540 	  // If the result has P34-1 digits or less, redo the steps above with
1541 	  // x1-1 but only if initially x1 >= 1
1542 	  // NOTE: these two steps can be improved, e.g we could guess if
1543 	  // P34+1 or P34-1 digits will be obtained by adding/subtracting
1544 	  // just the top 64 bits of the two operands
1545 	  // The result cannot be zero, and it cannot overflow
1546 	  x1 = q2 - 1;	// 0 <= x1 <= P34-1
1547 	  // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
1548 	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
1549 	  scale = P34 - q1 + 1;	// scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
1550 	  // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
1551 	  // but their product fits with certainty in 128 bits
1552 	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
1553 	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1554 	  } else {	// if (scale >= 1
1555 	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
1556 	    if (q1 <= 19) {	// C1 fits in 64 bits
1557 	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1558 	    } else {	// q1 >= 20
1559 	      C1.w[1] = C1_hi;
1560 	      C1.w[0] = C1_lo;
1561 	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1562 	    }
1563 	  }
1564 	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
1565 
1566 	  // now round C2 to q2-x1 = 1 decimal digit
1567 	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
1568 	  ind = x1 - 1;	// -1 <= ind <= P34 - 2
1569 	  if (ind >= 0) {	// if (x1 >= 1)
1570 	    C2.w[0] = C2_lo;
1571 	    C2.w[1] = C2_hi;
1572 	    if (ind <= 18) {
1573 	      C2.w[0] = C2.w[0] + midpoint64[ind];
1574 	      if (C2.w[0] < C2_lo)
1575 		C2.w[1]++;
1576 	    } else {	// 19 <= ind <= 32
1577 	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
1578 	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
1579 	      if (C2.w[0] < C2_lo)
1580 		C2.w[1]++;
1581 	    }
1582 	    // the approximation of 10^(-x1) was rounded up to 118 bits
1583 	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
1584 	    // calculate C2* and f2*
1585 	    // C2* is actually floor(C2*) in this case
1586 	    // C2* and f2* need shifting and masking, as shown by
1587 	    // shiftright128[] and maskhigh128[]
1588 	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
1589 	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
1590 	    // if (0 < f2* < 10^(-x1)) then
1591 	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
1592 	    //       shift; C2* has p decimal digits, correct by Prop. 1)
1593 	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
1594 	    //       shift; C2* has p decimal digits, correct by Pr. 1)
1595 	    // else
1596 	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
1597 	    //       correct by Property 1)
1598 	    // n = C2* * 10^(e2+x1)
1599 
1600 	    if (ind <= 2) {
1601 	      highf2star.w[1] = 0x0;
1602 	      highf2star.w[0] = 0x0;	// low f2* ok
1603 	    } else if (ind <= 21) {
1604 	      highf2star.w[1] = 0x0;
1605 	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
1606 	    } else {
1607 	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
1608 	      highf2star.w[0] = R256.w[2];	// low f2* is ok
1609 	    }
1610 	    // shift right C2* by Ex-128 = shiftright128[ind]
1611 	    if (ind >= 3) {
1612 	      shift = shiftright128[ind];
1613 	      if (shift < 64) {	// 3 <= shift <= 63
1614 		R256.w[2] =
1615 		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
1616 		R256.w[3] = (R256.w[3] >> shift);
1617 	      } else {	// 66 <= shift <= 102
1618 		R256.w[2] = (R256.w[3] >> (shift - 64));
1619 		R256.w[3] = 0x0ULL;
1620 	      }
1621 	    }
1622 	    // redundant
1623 	    is_inexact_lt_midpoint = 0;
1624 	    is_inexact_gt_midpoint = 0;
1625 	    is_midpoint_lt_even = 0;
1626 	    is_midpoint_gt_even = 0;
1627 	    // determine inexactness of the rounding of C2*
1628 	    // (cannot be followed by a second rounding)
1629 	    // if (0 < f2* - 1/2 < 10^(-x1)) then
1630 	    //   the result is exact
1631 	    // else (if f2* - 1/2 > T* then)
1632 	    //   the result of is inexact
1633 	    if (ind <= 2) {
1634 	      if (R256.w[1] > 0x8000000000000000ull ||
1635 		  (R256.w[1] == 0x8000000000000000ull
1636 		   && R256.w[0] > 0x0ull)) {
1637 		// f2* > 1/2 and the result may be exact
1638 		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
1639 		if ((tmp64A > ten2mk128trunc[ind].w[1]
1640 		     || (tmp64A == ten2mk128trunc[ind].w[1]
1641 			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
1642 		  // set the inexact flag
1643 		  *pfpsf |= INEXACT_EXCEPTION;
1644 		  // this rounding is applied to C2 only!
1645 		  // x_sign != y_sign
1646 		  is_inexact_gt_midpoint = 1;
1647 		}	// else the result is exact
1648 		// rounding down, unless a midpoint in [ODD, EVEN]
1649 	      } else {	// the result is inexact; f2* <= 1/2
1650 		// set the inexact flag
1651 		*pfpsf |= INEXACT_EXCEPTION;
1652 		// this rounding is applied to C2 only!
1653 		// x_sign != y_sign
1654 		is_inexact_lt_midpoint = 1;
1655 	      }
1656 	    } else if (ind <= 21) {	// if 3 <= ind <= 21
1657 	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
1658 					    && highf2star.w[0] >
1659 					    onehalf128[ind])
1660 		  || (highf2star.w[1] == 0x0
1661 		      && highf2star.w[0] == onehalf128[ind]
1662 		      && (R256.w[1] || R256.w[0]))) {
1663 		// f2* > 1/2 and the result may be exact
1664 		// Calculate f2* - 1/2
1665 		tmp64A = highf2star.w[0] - onehalf128[ind];
1666 		tmp64B = highf2star.w[1];
1667 		if (tmp64A > highf2star.w[0])
1668 		  tmp64B--;
1669 		if (tmp64B || tmp64A
1670 		    || R256.w[1] > ten2mk128trunc[ind].w[1]
1671 		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1672 			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
1673 		  // set the inexact flag
1674 		  *pfpsf |= INEXACT_EXCEPTION;
1675 		  // this rounding is applied to C2 only!
1676 		  // x_sign != y_sign
1677 		  is_inexact_gt_midpoint = 1;
1678 		}	// else the result is exact
1679 	      } else {	// the result is inexact; f2* <= 1/2
1680 		// set the inexact flag
1681 		*pfpsf |= INEXACT_EXCEPTION;
1682 		// this rounding is applied to C2 only!
1683 		// x_sign != y_sign
1684 		is_inexact_lt_midpoint = 1;
1685 	      }
1686 	    } else {	// if 22 <= ind <= 33
1687 	      if (highf2star.w[1] > onehalf128[ind]
1688 		  || (highf2star.w[1] == onehalf128[ind]
1689 		      && (highf2star.w[0] || R256.w[1]
1690 			  || R256.w[0]))) {
1691 		// f2* > 1/2 and the result may be exact
1692 		// Calculate f2* - 1/2
1693 		// tmp64A = highf2star.w[0];
1694 		tmp64B = highf2star.w[1] - onehalf128[ind];
1695 		if (tmp64B || highf2star.w[0]
1696 		    || R256.w[1] > ten2mk128trunc[ind].w[1]
1697 		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1698 			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
1699 		  // set the inexact flag
1700 		  *pfpsf |= INEXACT_EXCEPTION;
1701 		  // this rounding is applied to C2 only!
1702 		  // x_sign != y_sign
1703 		  is_inexact_gt_midpoint = 1;
1704 		}	// else the result is exact
1705 	      } else {	// the result is inexact; f2* <= 1/2
1706 		// set the inexact flag
1707 		*pfpsf |= INEXACT_EXCEPTION;
1708 		// this rounding is applied to C2 only!
1709 		// x_sign != y_sign
1710 		is_inexact_lt_midpoint = 1;
1711 	      }
1712 	    }
1713 	    // check for midpoints after determining inexactness
1714 	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
1715 		&& (highf2star.w[0] == 0)
1716 		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
1717 		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1718 			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
1719 	      // the result is a midpoint
1720 	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
1721 		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
1722 		R256.w[2]--;
1723 		if (R256.w[2] == 0xffffffffffffffffull)
1724 		  R256.w[3]--;
1725 		// this rounding is applied to C2 only!
1726 		// x_sign != y_sign
1727 		is_midpoint_lt_even = 1;
1728 		is_inexact_lt_midpoint = 0;
1729 		is_inexact_gt_midpoint = 0;
1730 	      } else {
1731 		// else MP in [ODD, EVEN]
1732 		// this rounding is applied to C2 only!
1733 		// x_sign != y_sign
1734 		is_midpoint_gt_even = 1;
1735 		is_inexact_lt_midpoint = 0;
1736 		is_inexact_gt_midpoint = 0;
1737 	      }
1738 	    }
1739 	  } else {	// if (ind == -1) only when x1 = 0
1740 	    R256.w[2] = C2_lo;
1741 	    R256.w[3] = C2_hi;
1742 	    is_midpoint_lt_even = 0;
1743 	    is_midpoint_gt_even = 0;
1744 	    is_inexact_lt_midpoint = 0;
1745 	    is_inexact_gt_midpoint = 0;
1746 	  }
1747 	  // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
1748 	  // because x_sign != y_sign this last operation is exact
1749 	  C1.w[0] = C1.w[0] - R256.w[2];
1750 	  C1.w[1] = C1.w[1] - R256.w[3];
1751 	  if (C1.w[0] > tmp64)
1752 	    C1.w[1]--;	// borrow
1753 	  if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
1754 	    C1.w[0] = ~C1.w[0];
1755 	    C1.w[0]++;
1756 	    C1.w[1] = ~C1.w[1];
1757 	    if (C1.w[0] == 0x0)
1758 	      C1.w[1]++;
1759 	    tmp_sign = y_sign;	// the result will have the sign of y
1760 	  } else {
1761 	    tmp_sign = x_sign;
1762 	  }
1763 	  // the difference has exactly P34 digits
1764 	  x_sign = tmp_sign;
1765 	  if (x1 >= 1)
1766 	    y_exp = y_exp + ((UINT64) x1 << 49);
1767 	  C1_hi = C1.w[1];
1768 	  C1_lo = C1.w[0];
1769 	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
1770 	  if (rnd_mode != ROUNDING_TO_NEAREST) {
1771 	    if ((!x_sign
1772 		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
1773 		     ||
1774 		     ((rnd_mode == ROUNDING_TIES_AWAY
1775 		       || rnd_mode == ROUNDING_UP)
1776 		      && is_midpoint_gt_even))) || (x_sign
1777 						    &&
1778 						    ((rnd_mode ==
1779 						      ROUNDING_DOWN
1780 						      &&
1781 						      is_inexact_lt_midpoint)
1782 						     ||
1783 						     ((rnd_mode ==
1784 						       ROUNDING_TIES_AWAY
1785 						       || rnd_mode ==
1786 						       ROUNDING_DOWN)
1787 						      &&
1788 						      is_midpoint_gt_even))))
1789 	    {
1790 	      // C1 = C1 + 1
1791 	      C1_lo = C1_lo + 1;
1792 	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1793 		C1_hi = C1_hi + 1;
1794 	      }
1795 	      if (C1_hi == 0x0001ed09bead87c0ull
1796 		  && C1_lo == 0x378d8e6400000000ull) {
1797 		// C1 = 10^34 => rounding overflow
1798 		C1_hi = 0x0000314dc6448d93ull;
1799 		C1_lo = 0x38c15b0a00000000ull;	// 10^33
1800 		y_exp = y_exp + EXP_P1;
1801 	      }
1802 	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
1803 		       &&
1804 		       ((x_sign
1805 			 && (rnd_mode == ROUNDING_UP
1806 			     || rnd_mode == ROUNDING_TO_ZERO))
1807 			|| (!x_sign
1808 			    && (rnd_mode == ROUNDING_DOWN
1809 				|| rnd_mode == ROUNDING_TO_ZERO)))) {
1810 	      // C1 = C1 - 1
1811 	      C1_lo = C1_lo - 1;
1812 	      if (C1_lo == 0xffffffffffffffffull)
1813 		C1_hi--;
1814 	      // check if we crossed into the lower decade
1815 	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1816 		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1817 		C1_lo = 0x378d8e63ffffffffull;
1818 		y_exp = y_exp - EXP_P1;
1819 		// no underflow, because delta + q2 >= P34 + 1
1820 	      }
1821 	    } else {
1822 	      ;	// exact, the result is already correct
1823 	    }
1824 	  }
1825 	  // assemble the result
1826 	  res.w[1] = x_sign | y_exp | C1_hi;
1827 	  res.w[0] = C1_lo;
1828 	}
1829       }	// end delta = P34
1830     } else {	// if (|delta| <= P34 - 1)
1831       if (delta >= 0) {	// if (0 <= delta <= P34 - 1)
1832 	if (delta <= P34 - 1 - q2) {
1833 	  // calculate C' directly; the result is exact
1834 	  // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
1835 	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1836 	  // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
1837 	  // but their product fits with certainty in 128 bits (actually in 113)
1838 	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1839 
1840 	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
1841 	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1842 	    C1_hi = C1.w[1];
1843 	    C1_lo = C1.w[0];
1844 	  } else if (scale >= 1) {
1845 	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1846 	    if (q1 <= 19) {	// C1 fits in 64 bits
1847 	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1848 	    } else {	// q1 >= 20
1849 	      C1.w[1] = C1_hi;
1850 	      C1.w[0] = C1_lo;
1851 	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1852 	    }
1853 	    C1_hi = C1.w[1];
1854 	    C1_lo = C1.w[0];
1855 	  } else {	// if (scale == 0) C1 is unchanged
1856 	    C1.w[0] = C1_lo;	// C1.w[1] = C1_hi;
1857 	  }
1858 	  // now add C2
1859 	  if (x_sign == y_sign) {
1860 	    // the result cannot overflow
1861 	    C1_lo = C1_lo + C2_lo;
1862 	    C1_hi = C1_hi + C2_hi;
1863 	    if (C1_lo < C1.w[0])
1864 	      C1_hi++;
1865 	  } else {	// if x_sign != y_sign
1866 	    C1_lo = C1_lo - C2_lo;
1867 	    C1_hi = C1_hi - C2_hi;
1868 	    if (C1_lo > C1.w[0])
1869 	      C1_hi--;
1870 	    // the result can be zero, but it cannot overflow
1871 	    if (C1_lo == 0 && C1_hi == 0) {
1872 	      // assemble the result
1873 	      if (x_exp < y_exp)
1874 		res.w[1] = x_exp;
1875 	      else
1876 		res.w[1] = y_exp;
1877 	      res.w[0] = 0;
1878 	      if (rnd_mode == ROUNDING_DOWN) {
1879 		res.w[1] |= 0x8000000000000000ull;
1880 	      }
1881 	      BID_SWAP128 (res);
1882 	      BID_RETURN (res);
1883 	    }
1884 	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
1885 	      C1_lo = ~C1_lo;
1886 	      C1_lo++;
1887 	      C1_hi = ~C1_hi;
1888 	      if (C1_lo == 0x0)
1889 		C1_hi++;
1890 	      x_sign = y_sign;	// the result will have the sign of y
1891 	    }
1892 	  }
1893 	  // assemble the result
1894 	  res.w[1] = x_sign | y_exp | C1_hi;
1895 	  res.w[0] = C1_lo;
1896 	} else if (delta == P34 - q2) {
1897 	  // calculate C' directly; the result may be inexact if it requires
1898 	  // P34+1 decimal digits; in this case the 'cutoff' point for addition
1899 	  // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
1900 	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1901 	  // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
1902 	  // but their product fits with certainty in 128 bits (actually in 113)
1903 	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1904 	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
1905 	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1906 	  } else if (scale >= 1) {
1907 	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1908 	    if (q1 <= 19) {	// C1 fits in 64 bits
1909 	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1910 	    } else {	// q1 >= 20
1911 	      C1.w[1] = C1_hi;
1912 	      C1.w[0] = C1_lo;
1913 	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1914 	    }
1915 	  } else {	// if (scale == 0) C1 is unchanged
1916 	    C1.w[1] = C1_hi;
1917 	    C1.w[0] = C1_lo;	// only the low part is necessary
1918 	  }
1919 	  C1_hi = C1.w[1];
1920 	  C1_lo = C1.w[0];
1921 	  // now add C2
1922 	  if (x_sign == y_sign) {
1923 	    // the result can overflow!
1924 	    C1_lo = C1_lo + C2_lo;
1925 	    C1_hi = C1_hi + C2_hi;
1926 	    if (C1_lo < C1.w[0])
1927 	      C1_hi++;
1928 	    // test for overflow, possible only when C1 >= 10^34
1929 	    if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
1930 	      // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
1931 	      // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
1932 	      // decimal digits
1933 	      // Calculate C'' = C' + 1/2 * 10^x
1934 	      if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
1935 		C1_lo = C1_lo + 5;
1936 		C1_hi = C1_hi + 1;
1937 	      } else {
1938 		C1_lo = C1_lo + 5;
1939 	      }
1940 	      // the approximation of 10^(-1) was rounded up to 118 bits
1941 	      // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
1942 	      // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
1943 	      C1.w[1] = C1_hi;
1944 	      C1.w[0] = C1_lo;	// C''
1945 	      ten2m1.w[1] = 0x1999999999999999ull;
1946 	      ten2m1.w[0] = 0x9999999999999a00ull;
1947 	      __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
1948 	      // C* is actually floor(C*) in this case
1949 	      // the top Ex = 128 bits of 10^(-1) are
1950 	      // T* = 0x00199999999999999999999999999999
1951 	      // if (0 < f* < 10^(-x)) then
1952 	      //   if floor(C*) is even then C = floor(C*) - logical right
1953 	      //       shift; C has p decimal digits, correct by Prop. 1)
1954 	      //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
1955 	      //       shift; C has p decimal digits, correct by Pr. 1)
1956 	      // else
1957 	      //   C = floor(C*) (logical right shift; C has p decimal digits,
1958 	      //       correct by Property 1)
1959 	      // n = C * 10^(e2+x)
1960 	      if ((P256.w[1] || P256.w[0])
1961 		  && (P256.w[1] < 0x1999999999999999ull
1962 		      || (P256.w[1] == 0x1999999999999999ull
1963 			  && P256.w[0] <= 0x9999999999999999ull))) {
1964 		// the result is a midpoint
1965 		if (P256.w[2] & 0x01) {
1966 		  is_midpoint_gt_even = 1;
1967 		  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
1968 		  P256.w[2]--;
1969 		  if (P256.w[2] == 0xffffffffffffffffull)
1970 		    P256.w[3]--;
1971 		} else {
1972 		  is_midpoint_lt_even = 1;
1973 		}
1974 	      }
1975 	      // n = Cstar * 10^(e2+1)
1976 	      y_exp = y_exp + EXP_P1;
1977 	      // C* != 10^P because C* has P34 digits
1978 	      // check for overflow
1979 	      if (y_exp == EXP_MAX_P1
1980 		  && (rnd_mode == ROUNDING_TO_NEAREST
1981 		      || rnd_mode == ROUNDING_TIES_AWAY)) {
1982 		// overflow for RN
1983 		res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
1984 		res.w[0] = 0x0ull;
1985 		// set the inexact flag
1986 		*pfpsf |= INEXACT_EXCEPTION;
1987 		// set the overflow flag
1988 		*pfpsf |= OVERFLOW_EXCEPTION;
1989 		BID_SWAP128 (res);
1990 		BID_RETURN (res);
1991 	      }
1992 	      // if (0 < f* - 1/2 < 10^(-x)) then
1993 	      //   the result of the addition is exact
1994 	      // else
1995 	      //   the result of the addition is inexact
1996 	      if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
1997 		tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
1998 		if ((tmp64 > 0x1999999999999999ull
1999 		     || (tmp64 == 0x1999999999999999ull
2000 			 && P256.w[0] >= 0x9999999999999999ull))) {
2001 		  // set the inexact flag
2002 		  *pfpsf |= INEXACT_EXCEPTION;
2003 		  is_inexact = 1;
2004 		}	// else the result is exact
2005 	      } else {	// the result is inexact
2006 		// set the inexact flag
2007 		*pfpsf |= INEXACT_EXCEPTION;
2008 		is_inexact = 1;
2009 	      }
2010 	      C1_hi = P256.w[3];
2011 	      C1_lo = P256.w[2];
2012 	      if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2013 		is_inexact_lt_midpoint = is_inexact
2014 		  && (P256.w[1] & 0x8000000000000000ull);
2015 		is_inexact_gt_midpoint = is_inexact
2016 		  && !(P256.w[1] & 0x8000000000000000ull);
2017 	      }
2018 	      // general correction from RN to RA, RM, RP, RZ;
2019 	      // result uses y_exp
2020 	      if (rnd_mode != ROUNDING_TO_NEAREST) {
2021 		if ((!x_sign
2022 		     &&
2023 		     ((rnd_mode == ROUNDING_UP
2024 		       && is_inexact_lt_midpoint)
2025 		      ||
2026 		      ((rnd_mode == ROUNDING_TIES_AWAY
2027 			|| rnd_mode == ROUNDING_UP)
2028 		       && is_midpoint_gt_even))) || (x_sign
2029 						     &&
2030 						     ((rnd_mode ==
2031 						       ROUNDING_DOWN
2032 						       &&
2033 						       is_inexact_lt_midpoint)
2034 						      ||
2035 						      ((rnd_mode ==
2036 							ROUNDING_TIES_AWAY
2037 							|| rnd_mode ==
2038 							ROUNDING_DOWN)
2039 						       &&
2040 						       is_midpoint_gt_even))))
2041 		{
2042 		  // C1 = C1 + 1
2043 		  C1_lo = C1_lo + 1;
2044 		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2045 		    C1_hi = C1_hi + 1;
2046 		  }
2047 		  if (C1_hi == 0x0001ed09bead87c0ull
2048 		      && C1_lo == 0x378d8e6400000000ull) {
2049 		    // C1 = 10^34 => rounding overflow
2050 		    C1_hi = 0x0000314dc6448d93ull;
2051 		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
2052 		    y_exp = y_exp + EXP_P1;
2053 		  }
2054 		} else
2055 		  if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2056 		      &&
2057 		      ((x_sign
2058 			&& (rnd_mode == ROUNDING_UP
2059 			    || rnd_mode == ROUNDING_TO_ZERO))
2060 		       || (!x_sign
2061 			   && (rnd_mode == ROUNDING_DOWN
2062 			       || rnd_mode == ROUNDING_TO_ZERO)))) {
2063 		  // C1 = C1 - 1
2064 		  C1_lo = C1_lo - 1;
2065 		  if (C1_lo == 0xffffffffffffffffull)
2066 		    C1_hi--;
2067 		  // check if we crossed into the lower decade
2068 		  if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2069 		    C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2070 		    C1_lo = 0x378d8e63ffffffffull;
2071 		    y_exp = y_exp - EXP_P1;
2072 		    // no underflow, because delta + q2 >= P34 + 1
2073 		  }
2074 		} else {
2075 		  ;	// exact, the result is already correct
2076 		}
2077 		// in all cases check for overflow (RN and RA solved already)
2078 		if (y_exp == EXP_MAX_P1) {	// overflow
2079 		  if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2080 		      (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2081 		    C1_hi = 0x7800000000000000ull;	// +inf
2082 		    C1_lo = 0x0ull;
2083 		  } else {	// RM and res > 0, RP and res < 0, or RZ
2084 		    C1_hi = 0x5fffed09bead87c0ull;
2085 		    C1_lo = 0x378d8e63ffffffffull;
2086 		  }
2087 		  y_exp = 0;	// x_sign is preserved
2088 		  // set the inexact flag (in case the exact addition was exact)
2089 		  *pfpsf |= INEXACT_EXCEPTION;
2090 		  // set the overflow flag
2091 		  *pfpsf |= OVERFLOW_EXCEPTION;
2092 		}
2093 	      }
2094 	    }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2095 	  } else {	// if x_sign != y_sign the result is exact
2096 	    C1_lo = C1_lo - C2_lo;
2097 	    C1_hi = C1_hi - C2_hi;
2098 	    if (C1_lo > C1.w[0])
2099 	      C1_hi--;
2100 	    // the result can be zero, but it cannot overflow
2101 	    if (C1_lo == 0 && C1_hi == 0) {
2102 	      // assemble the result
2103 	      if (x_exp < y_exp)
2104 		res.w[1] = x_exp;
2105 	      else
2106 		res.w[1] = y_exp;
2107 	      res.w[0] = 0;
2108 	      if (rnd_mode == ROUNDING_DOWN) {
2109 		res.w[1] |= 0x8000000000000000ull;
2110 	      }
2111 	      BID_SWAP128 (res);
2112 	      BID_RETURN (res);
2113 	    }
2114 	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
2115 	      C1_lo = ~C1_lo;
2116 	      C1_lo++;
2117 	      C1_hi = ~C1_hi;
2118 	      if (C1_lo == 0x0)
2119 		C1_hi++;
2120 	      x_sign = y_sign;	// the result will have the sign of y
2121 	    }
2122 	  }
2123 	  // assemble the result
2124 	  res.w[1] = x_sign | y_exp | C1_hi;
2125 	  res.w[0] = C1_lo;
2126 	} else {	// if (delta >= P34 + 1 - q2)
2127 	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
2128 	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
2129 	  // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
2130 	  // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
2131 	  // If the result has P34+1 digits, redo the steps above with x1+1
2132 	  // If the result has P34-1 digits or less, redo the steps above with
2133 	  // x1-1 but only if initially x1 >= 1
2134 	  // NOTE: these two steps can be improved, e.g we could guess if
2135 	  // P34+1 or P34-1 digits will be obtained by adding/subtracting just
2136 	  // the top 64 bits of the two operands
2137 	  // The result cannot be zero, but it can overflow
2138 	  x1 = delta + q2 - P34;	// 1 <= x1 <= P34-1
2139 	roundC2:
2140 	  // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
2141 	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
2142 	  scale = delta - q1 + q2 - x1;	// scale = e1 - e2 - x1 = P34 - q1
2143 	  // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
2144 	  // but their product fits with certainty in 128 bits (actually in 113)
2145 	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
2146 	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2147 	  } else if (scale >= 1) {
2148 	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
2149 	    if (q1 <= 19) {	// C1 fits in 64 bits
2150 	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2151 	    } else {	// q1 >= 20
2152 	      C1.w[1] = C1_hi;
2153 	      C1.w[0] = C1_lo;
2154 	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2155 	    }
2156 	  } else {	// if (scale == 0) C1 is unchanged
2157 	    C1.w[1] = C1_hi;
2158 	    C1.w[0] = C1_lo;
2159 	  }
2160 	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
2161 
2162 	  // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
2163 	  // (but if we got here a second time after x1 = x1 - 1, then
2164 	  // x1 >= 0; note that for x1 = 0 C2 is unchanged)
2165 	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
2166 	  ind = x1 - 1;	// 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
2167 	  // during a second pass, then ind = -1
2168 	  if (ind >= 0) {	// if (x1 >= 1)
2169 	    C2.w[0] = C2_lo;
2170 	    C2.w[1] = C2_hi;
2171 	    if (ind <= 18) {
2172 	      C2.w[0] = C2.w[0] + midpoint64[ind];
2173 	      if (C2.w[0] < C2_lo)
2174 		C2.w[1]++;
2175 	    } else {	// 19 <= ind <= 32
2176 	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
2177 	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
2178 	      if (C2.w[0] < C2_lo)
2179 		C2.w[1]++;
2180 	    }
2181 	    // the approximation of 10^(-x1) was rounded up to 118 bits
2182 	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
2183 	    // calculate C2* and f2*
2184 	    // C2* is actually floor(C2*) in this case
2185 	    // C2* and f2* need shifting and masking, as shown by
2186 	    // shiftright128[] and maskhigh128[]
2187 	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
2188 	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
2189 	    // if (0 < f2* < 10^(-x1)) then
2190 	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
2191 	    //       shift; C2* has p decimal digits, correct by Prop. 1)
2192 	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
2193 	    //       shift; C2* has p decimal digits, correct by Pr. 1)
2194 	    // else
2195 	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
2196 	    //       correct by Property 1)
2197 	    // n = C2* * 10^(e2+x1)
2198 
2199 	    if (ind <= 2) {
2200 	      highf2star.w[1] = 0x0;
2201 	      highf2star.w[0] = 0x0;	// low f2* ok
2202 	    } else if (ind <= 21) {
2203 	      highf2star.w[1] = 0x0;
2204 	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
2205 	    } else {
2206 	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
2207 	      highf2star.w[0] = R256.w[2];	// low f2* is ok
2208 	    }
2209 	    // shift right C2* by Ex-128 = shiftright128[ind]
2210 	    if (ind >= 3) {
2211 	      shift = shiftright128[ind];
2212 	      if (shift < 64) {	// 3 <= shift <= 63
2213 		R256.w[2] =
2214 		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
2215 		R256.w[3] = (R256.w[3] >> shift);
2216 	      } else {	// 66 <= shift <= 102
2217 		R256.w[2] = (R256.w[3] >> (shift - 64));
2218 		R256.w[3] = 0x0ULL;
2219 	      }
2220 	    }
2221 	    if (second_pass) {
2222 	      is_inexact_lt_midpoint = 0;
2223 	      is_inexact_gt_midpoint = 0;
2224 	      is_midpoint_lt_even = 0;
2225 	      is_midpoint_gt_even = 0;
2226 	    }
2227 	    // determine inexactness of the rounding of C2* (this may be
2228 	    // followed by a second rounding only if we get P34+1
2229 	    // decimal digits)
2230 	    // if (0 < f2* - 1/2 < 10^(-x1)) then
2231 	    //   the result is exact
2232 	    // else (if f2* - 1/2 > T* then)
2233 	    //   the result of is inexact
2234 	    if (ind <= 2) {
2235 	      if (R256.w[1] > 0x8000000000000000ull ||
2236 		  (R256.w[1] == 0x8000000000000000ull
2237 		   && R256.w[0] > 0x0ull)) {
2238 		// f2* > 1/2 and the result may be exact
2239 		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
2240 		if ((tmp64A > ten2mk128trunc[ind].w[1]
2241 		     || (tmp64A == ten2mk128trunc[ind].w[1]
2242 			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
2243 		  // set the inexact flag
2244 		  // *pfpsf |= INEXACT_EXCEPTION;
2245 		  tmp_inexact = 1;	// may be set again during a second pass
2246 		  // this rounding is applied to C2 only!
2247 		  if (x_sign == y_sign)
2248 		    is_inexact_lt_midpoint = 1;
2249 		  else	// if (x_sign != y_sign)
2250 		    is_inexact_gt_midpoint = 1;
2251 		}	// else the result is exact
2252 		// rounding down, unless a midpoint in [ODD, EVEN]
2253 	      } else {	// the result is inexact; f2* <= 1/2
2254 		// set the inexact flag
2255 		// *pfpsf |= INEXACT_EXCEPTION;
2256 		tmp_inexact = 1;	// just in case we will round a second time
2257 		// rounding up, unless a midpoint in [EVEN, ODD]
2258 		// this rounding is applied to C2 only!
2259 		if (x_sign == y_sign)
2260 		  is_inexact_gt_midpoint = 1;
2261 		else	// if (x_sign != y_sign)
2262 		  is_inexact_lt_midpoint = 1;
2263 	      }
2264 	    } else if (ind <= 21) {	// if 3 <= ind <= 21
2265 	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
2266 					    && highf2star.w[0] >
2267 					    onehalf128[ind])
2268 		  || (highf2star.w[1] == 0x0
2269 		      && highf2star.w[0] == onehalf128[ind]
2270 		      && (R256.w[1] || R256.w[0]))) {
2271 		// f2* > 1/2 and the result may be exact
2272 		// Calculate f2* - 1/2
2273 		tmp64A = highf2star.w[0] - onehalf128[ind];
2274 		tmp64B = highf2star.w[1];
2275 		if (tmp64A > highf2star.w[0])
2276 		  tmp64B--;
2277 		if (tmp64B || tmp64A
2278 		    || R256.w[1] > ten2mk128trunc[ind].w[1]
2279 		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2280 			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
2281 		  // set the inexact flag
2282 		  // *pfpsf |= INEXACT_EXCEPTION;
2283 		  tmp_inexact = 1;	// may be set again during a second pass
2284 		  // this rounding is applied to C2 only!
2285 		  if (x_sign == y_sign)
2286 		    is_inexact_lt_midpoint = 1;
2287 		  else	// if (x_sign != y_sign)
2288 		    is_inexact_gt_midpoint = 1;
2289 		}	// else the result is exact
2290 	      } else {	// the result is inexact; f2* <= 1/2
2291 		// set the inexact flag
2292 		// *pfpsf |= INEXACT_EXCEPTION;
2293 		tmp_inexact = 1;	// may be set again during a second pass
2294 		// rounding up, unless a midpoint in [EVEN, ODD]
2295 		// this rounding is applied to C2 only!
2296 		if (x_sign == y_sign)
2297 		  is_inexact_gt_midpoint = 1;
2298 		else	// if (x_sign != y_sign)
2299 		  is_inexact_lt_midpoint = 1;
2300 	      }
2301 	    } else {	// if 22 <= ind <= 33
2302 	      if (highf2star.w[1] > onehalf128[ind]
2303 		  || (highf2star.w[1] == onehalf128[ind]
2304 		      && (highf2star.w[0] || R256.w[1]
2305 			  || R256.w[0]))) {
2306 		// f2* > 1/2 and the result may be exact
2307 		// Calculate f2* - 1/2
2308 		// tmp64A = highf2star.w[0];
2309 		tmp64B = highf2star.w[1] - onehalf128[ind];
2310 		if (tmp64B || highf2star.w[0]
2311 		    || R256.w[1] > ten2mk128trunc[ind].w[1]
2312 		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2313 			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
2314 		  // set the inexact flag
2315 		  // *pfpsf |= INEXACT_EXCEPTION;
2316 		  tmp_inexact = 1;	// may be set again during a second pass
2317 		  // this rounding is applied to C2 only!
2318 		  if (x_sign == y_sign)
2319 		    is_inexact_lt_midpoint = 1;
2320 		  else	// if (x_sign != y_sign)
2321 		    is_inexact_gt_midpoint = 1;
2322 		}	// else the result is exact
2323 	      } else {	// the result is inexact; f2* <= 1/2
2324 		// set the inexact flag
2325 		// *pfpsf |= INEXACT_EXCEPTION;
2326 		tmp_inexact = 1;	// may be set again during a second pass
2327 		// rounding up, unless a midpoint in [EVEN, ODD]
2328 		// this rounding is applied to C2 only!
2329 		if (x_sign == y_sign)
2330 		  is_inexact_gt_midpoint = 1;
2331 		else	// if (x_sign != y_sign)
2332 		  is_inexact_lt_midpoint = 1;
2333 	      }
2334 	    }
2335 	    // check for midpoints
2336 	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
2337 		&& (highf2star.w[0] == 0)
2338 		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
2339 		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2340 			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
2341 	      // the result is a midpoint
2342 	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
2343 		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
2344 		R256.w[2]--;
2345 		if (R256.w[2] == 0xffffffffffffffffull)
2346 		  R256.w[3]--;
2347 		// this rounding is applied to C2 only!
2348 		if (x_sign == y_sign)
2349 		  is_midpoint_gt_even = 1;
2350 		else	// if (x_sign != y_sign)
2351 		  is_midpoint_lt_even = 1;
2352 		is_inexact_lt_midpoint = 0;
2353 		is_inexact_gt_midpoint = 0;
2354 	      } else {
2355 		// else MP in [ODD, EVEN]
2356 		// this rounding is applied to C2 only!
2357 		if (x_sign == y_sign)
2358 		  is_midpoint_lt_even = 1;
2359 		else	// if (x_sign != y_sign)
2360 		  is_midpoint_gt_even = 1;
2361 		is_inexact_lt_midpoint = 0;
2362 		is_inexact_gt_midpoint = 0;
2363 	      }
2364 	    }
2365 	    // end if (ind >= 0)
2366 	  } else {	// if (ind == -1); only during a 2nd pass, and when x1 = 0
2367 	    R256.w[2] = C2_lo;
2368 	    R256.w[3] = C2_hi;
2369 	    tmp_inexact = 0;
2370 	    // to correct a possible setting to 1 from 1st pass
2371 	    if (second_pass) {
2372 	      is_midpoint_lt_even = 0;
2373 	      is_midpoint_gt_even = 0;
2374 	      is_inexact_lt_midpoint = 0;
2375 	      is_inexact_gt_midpoint = 0;
2376 	    }
2377 	  }
2378 	  // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
2379 	  if (x_sign == y_sign) {	// addition; could overflow
2380 	    // no second pass is possible this way (only for x_sign != y_sign)
2381 	    C1.w[0] = C1.w[0] + R256.w[2];
2382 	    C1.w[1] = C1.w[1] + R256.w[3];
2383 	    if (C1.w[0] < tmp64)
2384 	      C1.w[1]++;	// carry
2385 	    // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
2386 	    // with x1=x1+1
2387 	    if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
2388 	      // chop off one more digit from the sum, but make sure there is
2389 	      // no double-rounding error (see table - double rounding logic)
2390 	      // now round C1 from P34+1 to P34 decimal digits
2391 	      // C1' = C1 + 1/2 * 10 = C1 + 5
2392 	      if (C1.w[0] >= 0xfffffffffffffffbull) {	// low half add has carry
2393 		C1.w[0] = C1.w[0] + 5;
2394 		C1.w[1] = C1.w[1] + 1;
2395 	      } else {
2396 		C1.w[0] = C1.w[0] + 5;
2397 	      }
2398 	      // the approximation of 10^(-1) was rounded up to 118 bits
2399 	      __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);	// Q256 = C1*, f1*
2400 	      // C1* is actually floor(C1*) in this case
2401 	      // the top 128 bits of 10^(-1) are
2402 	      // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
2403 	      // if (0 < f1* < 10^(-1)) then
2404 	      //   if floor(C1*) is even then C1* = floor(C1*) - logical right
2405 	      //       shift; C1* has p decimal digits, correct by Prop. 1)
2406 	      //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
2407 	      //       shift; C1* has p decimal digits, correct by Pr. 1)
2408 	      // else
2409 	      //   C1* = floor(C1*) (logical right shift; C has p decimal digits
2410 	      //       correct by Property 1)
2411 	      // n = C1* * 10^(e2+x1+1)
2412 	      if ((Q256.w[1] || Q256.w[0])
2413 		  && (Q256.w[1] < ten2mk128trunc[0].w[1]
2414 		      || (Q256.w[1] == ten2mk128trunc[0].w[1]
2415 			  && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
2416 		// the result is a midpoint
2417 		if (is_inexact_lt_midpoint) {	// for the 1st rounding
2418 		  is_inexact_gt_midpoint = 1;
2419 		  is_inexact_lt_midpoint = 0;
2420 		  is_midpoint_gt_even = 0;
2421 		  is_midpoint_lt_even = 0;
2422 		} else if (is_inexact_gt_midpoint) {	// for the 1st rounding
2423 		  Q256.w[2]--;
2424 		  if (Q256.w[2] == 0xffffffffffffffffull)
2425 		    Q256.w[3]--;
2426 		  is_inexact_gt_midpoint = 0;
2427 		  is_inexact_lt_midpoint = 1;
2428 		  is_midpoint_gt_even = 0;
2429 		  is_midpoint_lt_even = 0;
2430 		} else if (is_midpoint_gt_even) {	// for the 1st rounding
2431 		  // Note: cannot have is_midpoint_lt_even
2432 		  is_inexact_gt_midpoint = 0;
2433 		  is_inexact_lt_midpoint = 1;
2434 		  is_midpoint_gt_even = 0;
2435 		  is_midpoint_lt_even = 0;
2436 		} else {	// the first rounding must have been exact
2437 		  if (Q256.w[2] & 0x01) {	// MP in [EVEN, ODD]
2438 		    // the truncated result is correct
2439 		    Q256.w[2]--;
2440 		    if (Q256.w[2] == 0xffffffffffffffffull)
2441 		      Q256.w[3]--;
2442 		    is_inexact_gt_midpoint = 0;
2443 		    is_inexact_lt_midpoint = 0;
2444 		    is_midpoint_gt_even = 1;
2445 		    is_midpoint_lt_even = 0;
2446 		  } else {	// MP in [ODD, EVEN]
2447 		    is_inexact_gt_midpoint = 0;
2448 		    is_inexact_lt_midpoint = 0;
2449 		    is_midpoint_gt_even = 0;
2450 		    is_midpoint_lt_even = 1;
2451 		  }
2452 		}
2453 		tmp_inexact = 1;	// in all cases
2454 	      } else {	// the result is not a midpoint
2455 		// determine inexactness of the rounding of C1 (the sum C1+C2*)
2456 		// if (0 < f1* - 1/2 < 10^(-1)) then
2457 		//   the result is exact
2458 		// else (if f1* - 1/2 > T* then)
2459 		//   the result of is inexact
2460 		// ind = 0
2461 		if (Q256.w[1] > 0x8000000000000000ull
2462 		    || (Q256.w[1] == 0x8000000000000000ull
2463 			&& Q256.w[0] > 0x0ull)) {
2464 		  // f1* > 1/2 and the result may be exact
2465 		  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;	// f1* - 1/2
2466 		  if ((Q256.w[1] > ten2mk128trunc[0].w[1]
2467 		       || (Q256.w[1] == ten2mk128trunc[0].w[1]
2468 			   && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
2469 		    is_inexact_gt_midpoint = 0;
2470 		    is_inexact_lt_midpoint = 1;
2471 		    is_midpoint_gt_even = 0;
2472 		    is_midpoint_lt_even = 0;
2473 		    // set the inexact flag
2474 		    tmp_inexact = 1;
2475 		    // *pfpsf |= INEXACT_EXCEPTION;
2476 		  } else {	// else the result is exact for the 2nd rounding
2477 		    if (tmp_inexact) {	// if the previous rounding was inexact
2478 		      if (is_midpoint_lt_even) {
2479 			is_inexact_gt_midpoint = 1;
2480 			is_midpoint_lt_even = 0;
2481 		      } else if (is_midpoint_gt_even) {
2482 			is_inexact_lt_midpoint = 1;
2483 			is_midpoint_gt_even = 0;
2484 		      } else {
2485 			;	// no change
2486 		      }
2487 		    }
2488 		  }
2489 		  // rounding down, unless a midpoint in [ODD, EVEN]
2490 		} else {	// the result is inexact; f1* <= 1/2
2491 		  is_inexact_gt_midpoint = 1;
2492 		  is_inexact_lt_midpoint = 0;
2493 		  is_midpoint_gt_even = 0;
2494 		  is_midpoint_lt_even = 0;
2495 		  // set the inexact flag
2496 		  tmp_inexact = 1;
2497 		  // *pfpsf |= INEXACT_EXCEPTION;
2498 		}
2499 	      }	// end 'the result is not a midpoint'
2500 	      // n = C1 * 10^(e2+x1)
2501 	      C1.w[1] = Q256.w[3];
2502 	      C1.w[0] = Q256.w[2];
2503 	      y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
2504 	    } else {	// C1 < 10^34
2505 	      // C1.w[1] and C1.w[0] already set
2506 	      // n = C1 * 10^(e2+x1)
2507 	      y_exp = y_exp + ((UINT64) x1 << 49);
2508 	    }
2509 	    // check for overflow
2510 	    if (y_exp == EXP_MAX_P1
2511 		&& (rnd_mode == ROUNDING_TO_NEAREST
2512 		    || rnd_mode == ROUNDING_TIES_AWAY)) {
2513 	      res.w[1] = 0x7800000000000000ull | x_sign;	// +/-inf
2514 	      res.w[0] = 0x0ull;
2515 	      // set the inexact flag
2516 	      *pfpsf |= INEXACT_EXCEPTION;
2517 	      // set the overflow flag
2518 	      *pfpsf |= OVERFLOW_EXCEPTION;
2519 	      BID_SWAP128 (res);
2520 	      BID_RETURN (res);
2521 	    }	// else no overflow
2522 	  } else {	// if x_sign != y_sign the result of this subtract. is exact
2523 	    C1.w[0] = C1.w[0] - R256.w[2];
2524 	    C1.w[1] = C1.w[1] - R256.w[3];
2525 	    if (C1.w[0] > tmp64)
2526 	      C1.w[1]--;	// borrow
2527 	    if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
2528 	      C1.w[0] = ~C1.w[0];
2529 	      C1.w[0]++;
2530 	      C1.w[1] = ~C1.w[1];
2531 	      if (C1.w[0] == 0x0)
2532 		C1.w[1]++;
2533 	      tmp_sign = y_sign;
2534 	      // the result will have the sign of y if last rnd
2535 	    } else {
2536 	      tmp_sign = x_sign;
2537 	    }
2538 	    // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
2539 	    //   redo the calculation with x1=x1-1;
2540 	    // redo the calculation also if C1 = 10^33 and
2541 	    //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
2542 	    //   (the last part should have really been
2543 	    //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
2544 	    //    the rounding of C2, but the position flags have been reversed)
2545 	    // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
2546 	    if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {	// C1=10^33
2547 	      x1 = x1 - 1;	// x1 >= 0
2548 	      if (x1 >= 0) {
2549 		// clear position flags and tmp_inexact
2550 		is_midpoint_lt_even = 0;
2551 		is_midpoint_gt_even = 0;
2552 		is_inexact_lt_midpoint = 0;
2553 		is_inexact_gt_midpoint = 0;
2554 		tmp_inexact = 0;
2555 		second_pass = 1;
2556 		goto roundC2;	// else result has less than P34 digits
2557 	      }
2558 	    }
2559 	    // if the coefficient of the result is 10^34 it means that this
2560 	    // must be the second pass, and we are done
2561 	    if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) {	// if  C1 = 10^34
2562 	      C1.w[1] = 0x0000314dc6448d93ull;	// C1 = 10^33
2563 	      C1.w[0] = 0x38c15b0a00000000ull;
2564 	      y_exp = y_exp + ((UINT64) 1 << 49);
2565 	    }
2566 	    x_sign = tmp_sign;
2567 	    if (x1 >= 1)
2568 	      y_exp = y_exp + ((UINT64) x1 << 49);
2569 	    // x1 = -1 is possible at the end of a second pass when the
2570 	    // first pass started with x1 = 1
2571 	  }
2572 	  C1_hi = C1.w[1];
2573 	  C1_lo = C1.w[0];
2574 	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2575 	  if (rnd_mode != ROUNDING_TO_NEAREST) {
2576 	    if ((!x_sign
2577 		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
2578 		     ||
2579 		     ((rnd_mode == ROUNDING_TIES_AWAY
2580 		       || rnd_mode == ROUNDING_UP)
2581 		      && is_midpoint_gt_even))) || (x_sign
2582 						    &&
2583 						    ((rnd_mode ==
2584 						      ROUNDING_DOWN
2585 						      &&
2586 						      is_inexact_lt_midpoint)
2587 						     ||
2588 						     ((rnd_mode ==
2589 						       ROUNDING_TIES_AWAY
2590 						       || rnd_mode ==
2591 						       ROUNDING_DOWN)
2592 						      &&
2593 						      is_midpoint_gt_even))))
2594 	    {
2595 	      // C1 = C1 + 1
2596 	      C1_lo = C1_lo + 1;
2597 	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2598 		C1_hi = C1_hi + 1;
2599 	      }
2600 	      if (C1_hi == 0x0001ed09bead87c0ull
2601 		  && C1_lo == 0x378d8e6400000000ull) {
2602 		// C1 = 10^34 => rounding overflow
2603 		C1_hi = 0x0000314dc6448d93ull;
2604 		C1_lo = 0x38c15b0a00000000ull;	// 10^33
2605 		y_exp = y_exp + EXP_P1;
2606 	      }
2607 	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2608 		       &&
2609 		       ((x_sign
2610 			 && (rnd_mode == ROUNDING_UP
2611 			     || rnd_mode == ROUNDING_TO_ZERO))
2612 			|| (!x_sign
2613 			    && (rnd_mode == ROUNDING_DOWN
2614 				|| rnd_mode == ROUNDING_TO_ZERO)))) {
2615 	      // C1 = C1 - 1
2616 	      C1_lo = C1_lo - 1;
2617 	      if (C1_lo == 0xffffffffffffffffull)
2618 		C1_hi--;
2619 	      // check if we crossed into the lower decade
2620 	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2621 		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2622 		C1_lo = 0x378d8e63ffffffffull;
2623 		y_exp = y_exp - EXP_P1;
2624 		// no underflow, because delta + q2 >= P34 + 1
2625 	      }
2626 	    } else {
2627 	      ;	// exact, the result is already correct
2628 	    }
2629 	    // in all cases check for overflow (RN and RA solved already)
2630 	    if (y_exp == EXP_MAX_P1) {	// overflow
2631 	      if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2632 		  (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2633 		C1_hi = 0x7800000000000000ull;	// +inf
2634 		C1_lo = 0x0ull;
2635 	      } else {	// RM and res > 0, RP and res < 0, or RZ
2636 		C1_hi = 0x5fffed09bead87c0ull;
2637 		C1_lo = 0x378d8e63ffffffffull;
2638 	      }
2639 	      y_exp = 0;	// x_sign is preserved
2640 	      // set the inexact flag (in case the exact addition was exact)
2641 	      *pfpsf |= INEXACT_EXCEPTION;
2642 	      // set the overflow flag
2643 	      *pfpsf |= OVERFLOW_EXCEPTION;
2644 	    }
2645 	  }
2646 	  // assemble the result
2647 	  res.w[1] = x_sign | y_exp | C1_hi;
2648 	  res.w[0] = C1_lo;
2649 	  if (tmp_inexact)
2650 	    *pfpsf |= INEXACT_EXCEPTION;
2651 	}
2652       } else {	// if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
2653 	// NOTE: the following, up to "} else { // if x_sign != y_sign
2654 	// the result is exact" is identical to "else if (delta == P34 - q2) {"
2655 	// from above; also, the code is not symmetric: a+b and b+a may take
2656 	// different paths (need to unify eventually!)
2657 	// calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be
2658 	// inexact if it requires P34 + 1 decimal digits; in either case the
2659 	// 'cutoff' point for addition is at the position of the lsb of C2
2660 	// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
2661 	// exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
2662 	// but their product fits with certainty in 128 bits (actually in 113)
2663 	// Note that 0 <= e1 - e2 <= P34 - 2
2664 	//   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
2665 	//   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
2666 	//   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
2667 	//   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
2668 	scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
2669 	if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
2670 	  __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2671 	} else if (scale >= 1) {
2672 	  // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
2673 	  if (q1 <= 19) {	// C1 fits in 64 bits
2674 	    __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2675 	  } else {	// q1 >= 20
2676 	    C1.w[1] = C1_hi;
2677 	    C1.w[0] = C1_lo;
2678 	    __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2679 	  }
2680 	} else {	// if (scale == 0) C1 is unchanged
2681 	  C1.w[1] = C1_hi;
2682 	  C1.w[0] = C1_lo;	// only the low part is necessary
2683 	}
2684 	C1_hi = C1.w[1];
2685 	C1_lo = C1.w[0];
2686 	// now add C2
2687 	if (x_sign == y_sign) {
2688 	  // the result can overflow!
2689 	  C1_lo = C1_lo + C2_lo;
2690 	  C1_hi = C1_hi + C2_hi;
2691 	  if (C1_lo < C1.w[0])
2692 	    C1_hi++;
2693 	  // test for overflow, possible only when C1 >= 10^34
2694 	  if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
2695 	    // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
2696 	    // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
2697 	    // decimal digits
2698 	    // Calculate C'' = C' + 1/2 * 10^x
2699 	    if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
2700 	      C1_lo = C1_lo + 5;
2701 	      C1_hi = C1_hi + 1;
2702 	    } else {
2703 	      C1_lo = C1_lo + 5;
2704 	    }
2705 	    // the approximation of 10^(-1) was rounded up to 118 bits
2706 	    // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
2707 	    // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
2708 	    C1.w[1] = C1_hi;
2709 	    C1.w[0] = C1_lo;	// C''
2710 	    ten2m1.w[1] = 0x1999999999999999ull;
2711 	    ten2m1.w[0] = 0x9999999999999a00ull;
2712 	    __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
2713 	    // C* is actually floor(C*) in this case
2714 	    // the top Ex = 128 bits of 10^(-1) are
2715 	    // T* = 0x00199999999999999999999999999999
2716 	    // if (0 < f* < 10^(-x)) then
2717 	    //   if floor(C*) is even then C = floor(C*) - logical right
2718 	    //       shift; C has p decimal digits, correct by Prop. 1)
2719 	    //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
2720 	    //       shift; C has p decimal digits, correct by Pr. 1)
2721 	    // else
2722 	    //   C = floor(C*) (logical right shift; C has p decimal digits,
2723 	    //       correct by Property 1)
2724 	    // n = C * 10^(e2+x)
2725 	    if ((P256.w[1] || P256.w[0])
2726 		&& (P256.w[1] < 0x1999999999999999ull
2727 		    || (P256.w[1] == 0x1999999999999999ull
2728 			&& P256.w[0] <= 0x9999999999999999ull))) {
2729 	      // the result is a midpoint
2730 	      if (P256.w[2] & 0x01) {
2731 		is_midpoint_gt_even = 1;
2732 		// if floor(C*) is odd C = floor(C*) - 1; the result is not 0
2733 		P256.w[2]--;
2734 		if (P256.w[2] == 0xffffffffffffffffull)
2735 		  P256.w[3]--;
2736 	      } else {
2737 		is_midpoint_lt_even = 1;
2738 	      }
2739 	    }
2740 	    // n = Cstar * 10^(e2+1)
2741 	    y_exp = y_exp + EXP_P1;
2742 	    // C* != 10^P34 because C* has P34 digits
2743 	    // check for overflow
2744 	    if (y_exp == EXP_MAX_P1
2745 		&& (rnd_mode == ROUNDING_TO_NEAREST
2746 		    || rnd_mode == ROUNDING_TIES_AWAY)) {
2747 	      // overflow for RN
2748 	      res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
2749 	      res.w[0] = 0x0ull;
2750 	      // set the inexact flag
2751 	      *pfpsf |= INEXACT_EXCEPTION;
2752 	      // set the overflow flag
2753 	      *pfpsf |= OVERFLOW_EXCEPTION;
2754 	      BID_SWAP128 (res);
2755 	      BID_RETURN (res);
2756 	    }
2757 	    // if (0 < f* - 1/2 < 10^(-x)) then
2758 	    //   the result of the addition is exact
2759 	    // else
2760 	    //   the result of the addition is inexact
2761 	    if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
2762 	      tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
2763 	      if ((tmp64 > 0x1999999999999999ull
2764 		   || (tmp64 == 0x1999999999999999ull
2765 		       && P256.w[0] >= 0x9999999999999999ull))) {
2766 		// set the inexact flag
2767 		*pfpsf |= INEXACT_EXCEPTION;
2768 		is_inexact = 1;
2769 	      }	// else the result is exact
2770 	    } else {	// the result is inexact
2771 	      // set the inexact flag
2772 	      *pfpsf |= INEXACT_EXCEPTION;
2773 	      is_inexact = 1;
2774 	    }
2775 	    C1_hi = P256.w[3];
2776 	    C1_lo = P256.w[2];
2777 	    if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2778 	      is_inexact_lt_midpoint = is_inexact
2779 		&& (P256.w[1] & 0x8000000000000000ull);
2780 	      is_inexact_gt_midpoint = is_inexact
2781 		&& !(P256.w[1] & 0x8000000000000000ull);
2782 	    }
2783 	    // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2784 	    if (rnd_mode != ROUNDING_TO_NEAREST) {
2785 	      if ((!x_sign
2786 		   && ((rnd_mode == ROUNDING_UP
2787 			&& is_inexact_lt_midpoint)
2788 		       || ((rnd_mode == ROUNDING_TIES_AWAY
2789 			    || rnd_mode == ROUNDING_UP)
2790 			   && is_midpoint_gt_even))) || (x_sign
2791 							 &&
2792 							 ((rnd_mode ==
2793 							   ROUNDING_DOWN
2794 							   &&
2795 							   is_inexact_lt_midpoint)
2796 							  ||
2797 							  ((rnd_mode ==
2798 							    ROUNDING_TIES_AWAY
2799 							    || rnd_mode
2800 							    ==
2801 							    ROUNDING_DOWN)
2802 							   &&
2803 							   is_midpoint_gt_even))))
2804 	      {
2805 		// C1 = C1 + 1
2806 		C1_lo = C1_lo + 1;
2807 		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2808 		  C1_hi = C1_hi + 1;
2809 		}
2810 		if (C1_hi == 0x0001ed09bead87c0ull
2811 		    && C1_lo == 0x378d8e6400000000ull) {
2812 		  // C1 = 10^34 => rounding overflow
2813 		  C1_hi = 0x0000314dc6448d93ull;
2814 		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
2815 		  y_exp = y_exp + EXP_P1;
2816 		}
2817 	      } else
2818 		if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
2819 		    ((x_sign && (rnd_mode == ROUNDING_UP ||
2820 				 rnd_mode == ROUNDING_TO_ZERO)) ||
2821 		     (!x_sign && (rnd_mode == ROUNDING_DOWN ||
2822 				  rnd_mode == ROUNDING_TO_ZERO)))) {
2823 		// C1 = C1 - 1
2824 		C1_lo = C1_lo - 1;
2825 		if (C1_lo == 0xffffffffffffffffull)
2826 		  C1_hi--;
2827 		// check if we crossed into the lower decade
2828 		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2829 		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2830 		  C1_lo = 0x378d8e63ffffffffull;
2831 		  y_exp = y_exp - EXP_P1;
2832 		  // no underflow, because delta + q2 >= P34 + 1
2833 		}
2834 	      } else {
2835 		;	// exact, the result is already correct
2836 	      }
2837 	      // in all cases check for overflow (RN and RA solved already)
2838 	      if (y_exp == EXP_MAX_P1) {	// overflow
2839 		if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2840 		    (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2841 		  C1_hi = 0x7800000000000000ull;	// +inf
2842 		  C1_lo = 0x0ull;
2843 		} else {	// RM and res > 0, RP and res < 0, or RZ
2844 		  C1_hi = 0x5fffed09bead87c0ull;
2845 		  C1_lo = 0x378d8e63ffffffffull;
2846 		}
2847 		y_exp = 0;	// x_sign is preserved
2848 		// set the inexact flag (in case the exact addition was exact)
2849 		*pfpsf |= INEXACT_EXCEPTION;
2850 		// set the overflow flag
2851 		*pfpsf |= OVERFLOW_EXCEPTION;
2852 	      }
2853 	    }
2854 	  }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2855 	  // assemble the result
2856 	  res.w[1] = x_sign | y_exp | C1_hi;
2857 	  res.w[0] = C1_lo;
2858 	} else {	// if x_sign != y_sign the result is exact
2859 	  C1_lo = C2_lo - C1_lo;
2860 	  C1_hi = C2_hi - C1_hi;
2861 	  if (C1_lo > C2_lo)
2862 	    C1_hi--;
2863 	  if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
2864 	    C1_lo = ~C1_lo;
2865 	    C1_lo++;
2866 	    C1_hi = ~C1_hi;
2867 	    if (C1_lo == 0x0)
2868 	      C1_hi++;
2869 	    x_sign = y_sign;	// the result will have the sign of y
2870 	  }
2871 	  // the result can be zero, but it cannot overflow
2872 	  if (C1_lo == 0 && C1_hi == 0) {
2873 	    // assemble the result
2874 	    if (x_exp < y_exp)
2875 	      res.w[1] = x_exp;
2876 	    else
2877 	      res.w[1] = y_exp;
2878 	    res.w[0] = 0;
2879 	    if (rnd_mode == ROUNDING_DOWN) {
2880 	      res.w[1] |= 0x8000000000000000ull;
2881 	    }
2882 	    BID_SWAP128 (res);
2883 	    BID_RETURN (res);
2884 	  }
2885 	  // assemble the result
2886 	  res.w[1] = y_sign | y_exp | C1_hi;
2887 	  res.w[0] = C1_lo;
2888 	}
2889       }
2890     }
2891     BID_SWAP128 (res);
2892     BID_RETURN (res)
2893   }
2894 }
2895 
2896 
2897 
2898 // bid128_sub stands for bid128qq_sub
2899 
2900 /*****************************************************************************
2901  *  BID128 sub
2902  ****************************************************************************/
2903 
2904 #if DECIMAL_CALL_BY_REFERENCE
2905 void
2906 bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
2907 	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2908 	    _EXC_INFO_PARAM) {
2909   UINT128 x = *px, y = *py;
2910 #if !DECIMAL_GLOBAL_ROUNDING
2911   unsigned int rnd_mode = *prnd_mode;
2912 #endif
2913 #else
2914 UINT128
2915 bid128_sub (UINT128 x, UINT128 y
2916 	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2917 	    _EXC_INFO_PARAM) {
2918 #endif
2919 
2920   UINT128 res;
2921   UINT64 y_sign;
2922 
2923   if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
2924     // change its sign
2925     y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
2926     if (y_sign)
2927       y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
2928     else
2929       y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
2930   }
2931 #if DECIMAL_CALL_BY_REFERENCE
2932   bid128_add (&res, &x, &y
2933 	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2934 	      _EXC_INFO_ARG);
2935 #else
2936   res = bid128_add (x, y
2937 		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2938 		    _EXC_INFO_ARG);
2939 #endif
2940   BID_RETURN (res);
2941 }
2942