1 /* Software floating-point emulation. 2 Basic two-word fraction declaration and manipulation. 3 Copyright (C) 1997-2016 Free Software Foundation, Inc. 4 This file is part of the GNU C Library. 5 Contributed by Richard Henderson (rth@cygnus.com), 6 Jakub Jelinek (jj@ultra.linux.cz), 7 David S. Miller (davem@redhat.com) and 8 Peter Maydell (pmaydell@chiark.greenend.org.uk). 9 10 The GNU C Library is free software; you can redistribute it and/or 11 modify it under the terms of the GNU Lesser General Public 12 License as published by the Free Software Foundation; either 13 version 2.1 of the License, or (at your option) any later version. 14 15 In addition to the permissions in the GNU Lesser General Public 16 License, the Free Software Foundation gives you unlimited 17 permission to link the compiled version of this file into 18 combinations with other programs, and to distribute those 19 combinations without any restriction coming from the use of this 20 file. (The Lesser General Public License restrictions do apply in 21 other respects; for example, they cover modification of the file, 22 and distribution when not linked into a combine executable.) 23 24 The GNU C Library is distributed in the hope that it will be useful, 25 but WITHOUT ANY WARRANTY; without even the implied warranty of 26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 27 Lesser General Public License for more details. 28 29 You should have received a copy of the GNU Lesser General Public 30 License along with the GNU C Library; if not, see 31 <http://www.gnu.org/licenses/>. */ 32 33 #ifndef SOFT_FP_OP_2_H 34 #define SOFT_FP_OP_2_H 1 35 36 #define _FP_FRAC_DECL_2(X) \ 37 _FP_W_TYPE X##_f0 _FP_ZERO_INIT, X##_f1 _FP_ZERO_INIT 38 #define _FP_FRAC_COPY_2(D, S) (D##_f0 = S##_f0, D##_f1 = S##_f1) 39 #define _FP_FRAC_SET_2(X, I) __FP_FRAC_SET_2 (X, I) 40 #define _FP_FRAC_HIGH_2(X) (X##_f1) 41 #define _FP_FRAC_LOW_2(X) (X##_f0) 42 #define _FP_FRAC_WORD_2(X, w) (X##_f##w) 43 44 #define _FP_FRAC_SLL_2(X, N) \ 45 (void) (((N) < _FP_W_TYPE_SIZE) \ 46 ? ({ \ 47 if (__builtin_constant_p (N) && (N) == 1) \ 48 { \ 49 X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \ 50 X##_f0 += X##_f0; \ 51 } \ 52 else \ 53 { \ 54 X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \ 55 X##_f0 <<= (N); \ 56 } \ 57 0; \ 58 }) \ 59 : ({ \ 60 X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \ 61 X##_f0 = 0; \ 62 })) 63 64 65 #define _FP_FRAC_SRL_2(X, N) \ 66 (void) (((N) < _FP_W_TYPE_SIZE) \ 67 ? ({ \ 68 X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \ 69 X##_f1 >>= (N); \ 70 }) \ 71 : ({ \ 72 X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \ 73 X##_f1 = 0; \ 74 })) 75 76 /* Right shift with sticky-lsb. */ 77 #define _FP_FRAC_SRST_2(X, S, N, sz) \ 78 (void) (((N) < _FP_W_TYPE_SIZE) \ 79 ? ({ \ 80 S = (__builtin_constant_p (N) && (N) == 1 \ 81 ? X##_f0 & 1 \ 82 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0); \ 83 X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \ 84 X##_f1 >>= (N); \ 85 }) \ 86 : ({ \ 87 S = ((((N) == _FP_W_TYPE_SIZE \ 88 ? 0 \ 89 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \ 90 | X##_f0) != 0); \ 91 X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)); \ 92 X##_f1 = 0; \ 93 })) 94 95 #define _FP_FRAC_SRS_2(X, N, sz) \ 96 (void) (((N) < _FP_W_TYPE_SIZE) \ 97 ? ({ \ 98 X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \ 99 | (__builtin_constant_p (N) && (N) == 1 \ 100 ? X##_f0 & 1 \ 101 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \ 102 X##_f1 >>= (N); \ 103 }) \ 104 : ({ \ 105 X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) \ 106 | ((((N) == _FP_W_TYPE_SIZE \ 107 ? 0 \ 108 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \ 109 | X##_f0) != 0)); \ 110 X##_f1 = 0; \ 111 })) 112 113 #define _FP_FRAC_ADDI_2(X, I) \ 114 __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I) 115 116 #define _FP_FRAC_ADD_2(R, X, Y) \ 117 __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0) 118 119 #define _FP_FRAC_SUB_2(R, X, Y) \ 120 __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0) 121 122 #define _FP_FRAC_DEC_2(X, Y) \ 123 __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0) 124 125 #define _FP_FRAC_CLZ_2(R, X) \ 126 do \ 127 { \ 128 if (X##_f1) \ 129 __FP_CLZ ((R), X##_f1); \ 130 else \ 131 { \ 132 __FP_CLZ ((R), X##_f0); \ 133 (R) += _FP_W_TYPE_SIZE; \ 134 } \ 135 } \ 136 while (0) 137 138 /* Predicates. */ 139 #define _FP_FRAC_NEGP_2(X) ((_FP_WS_TYPE) X##_f1 < 0) 140 #define _FP_FRAC_ZEROP_2(X) ((X##_f1 | X##_f0) == 0) 141 #define _FP_FRAC_OVERP_2(fs, X) (_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs) 142 #define _FP_FRAC_CLEAR_OVERP_2(fs, X) (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs) 143 #define _FP_FRAC_HIGHBIT_DW_2(fs, X) \ 144 (_FP_FRAC_HIGH_DW_##fs (X) & _FP_HIGHBIT_DW_##fs) 145 #define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0) 146 #define _FP_FRAC_GT_2(X, Y) \ 147 (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0)) 148 #define _FP_FRAC_GE_2(X, Y) \ 149 (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0)) 150 151 #define _FP_ZEROFRAC_2 0, 0 152 #define _FP_MINFRAC_2 0, 1 153 #define _FP_MAXFRAC_2 (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0) 154 155 /* Internals. */ 156 157 #define __FP_FRAC_SET_2(X, I1, I0) (X##_f0 = I0, X##_f1 = I1) 158 159 #define __FP_CLZ_2(R, xh, xl) \ 160 do \ 161 { \ 162 if (xh) \ 163 __FP_CLZ ((R), xh); \ 164 else \ 165 { \ 166 __FP_CLZ ((R), xl); \ 167 (R) += _FP_W_TYPE_SIZE; \ 168 } \ 169 } \ 170 while (0) 171 172 #if 0 173 174 # ifndef __FP_FRAC_ADDI_2 175 # define __FP_FRAC_ADDI_2(xh, xl, i) \ 176 (xh += ((xl += i) < i)) 177 # endif 178 # ifndef __FP_FRAC_ADD_2 179 # define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \ 180 (rh = xh + yh + ((rl = xl + yl) < xl)) 181 # endif 182 # ifndef __FP_FRAC_SUB_2 183 # define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \ 184 (rh = xh - yh - ((rl = xl - yl) > xl)) 185 # endif 186 # ifndef __FP_FRAC_DEC_2 187 # define __FP_FRAC_DEC_2(xh, xl, yh, yl) \ 188 do \ 189 { \ 190 UWtype __FP_FRAC_DEC_2_t = xl; \ 191 xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t); \ 192 } \ 193 while (0) 194 # endif 195 196 #else 197 198 # undef __FP_FRAC_ADDI_2 199 # define __FP_FRAC_ADDI_2(xh, xl, i) add_ssaaaa (xh, xl, xh, xl, 0, i) 200 # undef __FP_FRAC_ADD_2 201 # define __FP_FRAC_ADD_2 add_ssaaaa 202 # undef __FP_FRAC_SUB_2 203 # define __FP_FRAC_SUB_2 sub_ddmmss 204 # undef __FP_FRAC_DEC_2 205 # define __FP_FRAC_DEC_2(xh, xl, yh, yl) \ 206 sub_ddmmss (xh, xl, xh, xl, yh, yl) 207 208 #endif 209 210 /* Unpack the raw bits of a native fp value. Do not classify or 211 normalize the data. */ 212 213 #define _FP_UNPACK_RAW_2(fs, X, val) \ 214 do \ 215 { \ 216 union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo; \ 217 _FP_UNPACK_RAW_2_flo.flt = (val); \ 218 \ 219 X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0; \ 220 X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1; \ 221 X##_e = _FP_UNPACK_RAW_2_flo.bits.exp; \ 222 X##_s = _FP_UNPACK_RAW_2_flo.bits.sign; \ 223 } \ 224 while (0) 225 226 #define _FP_UNPACK_RAW_2_P(fs, X, val) \ 227 do \ 228 { \ 229 union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo \ 230 = (union _FP_UNION_##fs *) (val); \ 231 \ 232 X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0; \ 233 X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1; \ 234 X##_e = _FP_UNPACK_RAW_2_P_flo->bits.exp; \ 235 X##_s = _FP_UNPACK_RAW_2_P_flo->bits.sign; \ 236 } \ 237 while (0) 238 239 240 /* Repack the raw bits of a native fp value. */ 241 242 #define _FP_PACK_RAW_2(fs, val, X) \ 243 do \ 244 { \ 245 union _FP_UNION_##fs _FP_PACK_RAW_2_flo; \ 246 \ 247 _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0; \ 248 _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1; \ 249 _FP_PACK_RAW_2_flo.bits.exp = X##_e; \ 250 _FP_PACK_RAW_2_flo.bits.sign = X##_s; \ 251 \ 252 (val) = _FP_PACK_RAW_2_flo.flt; \ 253 } \ 254 while (0) 255 256 #define _FP_PACK_RAW_2_P(fs, val, X) \ 257 do \ 258 { \ 259 union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo \ 260 = (union _FP_UNION_##fs *) (val); \ 261 \ 262 _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0; \ 263 _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1; \ 264 _FP_PACK_RAW_2_P_flo->bits.exp = X##_e; \ 265 _FP_PACK_RAW_2_P_flo->bits.sign = X##_s; \ 266 } \ 267 while (0) 268 269 270 /* Multiplication algorithms: */ 271 272 /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */ 273 274 #define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit) \ 275 do \ 276 { \ 277 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b); \ 278 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c); \ 279 \ 280 doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0), \ 281 X##_f0, Y##_f0); \ 282 doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0, \ 283 X##_f0, Y##_f1); \ 284 doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0, \ 285 X##_f1, Y##_f0); \ 286 doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 287 X##_f1, Y##_f1); \ 288 \ 289 __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 290 _FP_FRAC_WORD_4 (R, 1), 0, \ 291 _FP_MUL_MEAT_DW_2_wide_b_f1, \ 292 _FP_MUL_MEAT_DW_2_wide_b_f0, \ 293 _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 294 _FP_FRAC_WORD_4 (R, 1)); \ 295 __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 296 _FP_FRAC_WORD_4 (R, 1), 0, \ 297 _FP_MUL_MEAT_DW_2_wide_c_f1, \ 298 _FP_MUL_MEAT_DW_2_wide_c_f0, \ 299 _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 300 _FP_FRAC_WORD_4 (R, 1)); \ 301 } \ 302 while (0) 303 304 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \ 305 do \ 306 { \ 307 _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z); \ 308 \ 309 _FP_MUL_MEAT_DW_2_wide ((wfracbits), _FP_MUL_MEAT_2_wide_z, \ 310 X, Y, doit); \ 311 \ 312 /* Normalize since we know where the msb of the multiplicands \ 313 were (bit B), we know that the msb of the of the product is \ 314 at either 2B or 2B-1. */ \ 315 _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, (wfracbits)-1, \ 316 2*(wfracbits)); \ 317 R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0); \ 318 R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1); \ 319 } \ 320 while (0) 321 322 /* Given a 1W * 1W => 2W primitive, do the extended multiplication. 323 Do only 3 multiplications instead of four. This one is for machines 324 where multiplication is much more expensive than subtraction. */ 325 326 #define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit) \ 327 do \ 328 { \ 329 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b); \ 330 _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c); \ 331 _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d; \ 332 int _FP_MUL_MEAT_DW_2_wide_3mul_c1; \ 333 int _FP_MUL_MEAT_DW_2_wide_3mul_c2; \ 334 \ 335 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1; \ 336 _FP_MUL_MEAT_DW_2_wide_3mul_c1 \ 337 = _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0; \ 338 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1; \ 339 _FP_MUL_MEAT_DW_2_wide_3mul_c2 \ 340 = _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0; \ 341 doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0), \ 342 X##_f0, Y##_f0); \ 343 doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1), \ 344 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0, \ 345 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1); \ 346 doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1, \ 347 _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1); \ 348 \ 349 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 \ 350 &= -_FP_MUL_MEAT_DW_2_wide_3mul_c2; \ 351 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 \ 352 &= -_FP_MUL_MEAT_DW_2_wide_3mul_c1; \ 353 __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 354 _FP_FRAC_WORD_4 (R, 1), \ 355 (_FP_MUL_MEAT_DW_2_wide_3mul_c1 \ 356 & _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0, \ 357 _FP_MUL_MEAT_DW_2_wide_3mul_d, \ 358 0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \ 359 __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 360 _FP_MUL_MEAT_DW_2_wide_3mul_b_f0); \ 361 __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 362 _FP_MUL_MEAT_DW_2_wide_3mul_b_f1); \ 363 __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 364 _FP_FRAC_WORD_4 (R, 1), \ 365 0, _FP_MUL_MEAT_DW_2_wide_3mul_d, \ 366 _FP_FRAC_WORD_4 (R, 0)); \ 367 __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 368 _FP_FRAC_WORD_4 (R, 1), 0, \ 369 _FP_MUL_MEAT_DW_2_wide_3mul_c_f1, \ 370 _FP_MUL_MEAT_DW_2_wide_3mul_c_f0); \ 371 __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \ 372 _FP_MUL_MEAT_DW_2_wide_3mul_c_f1, \ 373 _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, \ 374 _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2)); \ 375 } \ 376 while (0) 377 378 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \ 379 do \ 380 { \ 381 _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z); \ 382 \ 383 _FP_MUL_MEAT_DW_2_wide_3mul ((wfracbits), \ 384 _FP_MUL_MEAT_2_wide_3mul_z, \ 385 X, Y, doit); \ 386 \ 387 /* Normalize since we know where the msb of the multiplicands \ 388 were (bit B), we know that the msb of the of the product is \ 389 at either 2B or 2B-1. */ \ 390 _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z, \ 391 (wfracbits)-1, 2*(wfracbits)); \ 392 R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0); \ 393 R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1); \ 394 } \ 395 while (0) 396 397 #define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y) \ 398 do \ 399 { \ 400 _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2]; \ 401 _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2]; \ 402 _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0; \ 403 _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1; \ 404 _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0; \ 405 _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1; \ 406 \ 407 mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x, \ 408 _FP_MUL_MEAT_DW_2_gmp_y, 2); \ 409 } \ 410 while (0) 411 412 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \ 413 do \ 414 { \ 415 _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z); \ 416 \ 417 _FP_MUL_MEAT_DW_2_gmp ((wfracbits), _FP_MUL_MEAT_2_gmp_z, X, Y); \ 418 \ 419 /* Normalize since we know where the msb of the multiplicands \ 420 were (bit B), we know that the msb of the of the product is \ 421 at either 2B or 2B-1. */ \ 422 _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, (wfracbits)-1, \ 423 2*(wfracbits)); \ 424 R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0]; \ 425 R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1]; \ 426 } \ 427 while (0) 428 429 /* Do at most 120x120=240 bits multiplication using double floating 430 point multiplication. This is useful if floating point 431 multiplication has much bigger throughput than integer multiply. 432 It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits 433 between 106 and 120 only. 434 Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set. 435 SETFETZ is a macro which will disable all FPU exceptions and set rounding 436 towards zero, RESETFE should optionally reset it back. */ 437 438 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \ 439 do \ 440 { \ 441 static const double _const[] = \ 442 { \ 443 /* 2^-24 */ 5.9604644775390625e-08, \ 444 /* 2^-48 */ 3.5527136788005009e-15, \ 445 /* 2^-72 */ 2.1175823681357508e-22, \ 446 /* 2^-96 */ 1.2621774483536189e-29, \ 447 /* 2^28 */ 2.68435456e+08, \ 448 /* 2^4 */ 1.600000e+01, \ 449 /* 2^-20 */ 9.5367431640625e-07, \ 450 /* 2^-44 */ 5.6843418860808015e-14, \ 451 /* 2^-68 */ 3.3881317890172014e-21, \ 452 /* 2^-92 */ 2.0194839173657902e-28, \ 453 /* 2^-116 */ 1.2037062152420224e-35 \ 454 }; \ 455 double _a240, _b240, _c240, _d240, _e240, _f240, \ 456 _g240, _h240, _i240, _j240, _k240; \ 457 union { double d; UDItype i; } _l240, _m240, _n240, _o240, \ 458 _p240, _q240, _r240, _s240; \ 459 UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0; \ 460 \ 461 _FP_STATIC_ASSERT ((wfracbits) >= 106 && (wfracbits) <= 120, \ 462 "wfracbits out of range"); \ 463 \ 464 setfetz; \ 465 \ 466 _e240 = (double) (long) (X##_f0 & 0xffffff); \ 467 _j240 = (double) (long) (Y##_f0 & 0xffffff); \ 468 _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff); \ 469 _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff); \ 470 _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \ 471 _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \ 472 _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff); \ 473 _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff); \ 474 _a240 = (double) (long) (X##_f1 >> 32); \ 475 _f240 = (double) (long) (Y##_f1 >> 32); \ 476 _e240 *= _const[3]; \ 477 _j240 *= _const[3]; \ 478 _d240 *= _const[2]; \ 479 _i240 *= _const[2]; \ 480 _c240 *= _const[1]; \ 481 _h240 *= _const[1]; \ 482 _b240 *= _const[0]; \ 483 _g240 *= _const[0]; \ 484 _s240.d = _e240*_j240; \ 485 _r240.d = _d240*_j240 + _e240*_i240; \ 486 _q240.d = _c240*_j240 + _d240*_i240 + _e240*_h240; \ 487 _p240.d = _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \ 488 _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \ 489 _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240; \ 490 _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240; \ 491 _l240.d = _a240*_g240 + _b240*_f240; \ 492 _k240 = _a240*_f240; \ 493 _r240.d += _s240.d; \ 494 _q240.d += _r240.d; \ 495 _p240.d += _q240.d; \ 496 _o240.d += _p240.d; \ 497 _n240.d += _o240.d; \ 498 _m240.d += _n240.d; \ 499 _l240.d += _m240.d; \ 500 _k240 += _l240.d; \ 501 _s240.d -= ((_const[10]+_s240.d)-_const[10]); \ 502 _r240.d -= ((_const[9]+_r240.d)-_const[9]); \ 503 _q240.d -= ((_const[8]+_q240.d)-_const[8]); \ 504 _p240.d -= ((_const[7]+_p240.d)-_const[7]); \ 505 _o240.d += _const[7]; \ 506 _n240.d += _const[6]; \ 507 _m240.d += _const[5]; \ 508 _l240.d += _const[4]; \ 509 if (_s240.d != 0.0) \ 510 _y240 = 1; \ 511 if (_r240.d != 0.0) \ 512 _y240 = 1; \ 513 if (_q240.d != 0.0) \ 514 _y240 = 1; \ 515 if (_p240.d != 0.0) \ 516 _y240 = 1; \ 517 _t240 = (DItype) _k240; \ 518 _u240 = _l240.i; \ 519 _v240 = _m240.i; \ 520 _w240 = _n240.i; \ 521 _x240 = _o240.i; \ 522 R##_f1 = ((_t240 << (128 - (wfracbits - 1))) \ 523 | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104))); \ 524 R##_f0 = (((_u240 & 0xffffff) << (168 - (wfracbits - 1))) \ 525 | ((_v240 & 0xffffff) << (144 - (wfracbits - 1))) \ 526 | ((_w240 & 0xffffff) << (120 - (wfracbits - 1))) \ 527 | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96)) \ 528 | _y240); \ 529 resetfe; \ 530 } \ 531 while (0) 532 533 /* Division algorithms: */ 534 535 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y) \ 536 do \ 537 { \ 538 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2; \ 539 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1; \ 540 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0; \ 541 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1; \ 542 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0; \ 543 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1; \ 544 _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0; \ 545 if (_FP_FRAC_GE_2 (X, Y)) \ 546 { \ 547 _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1; \ 548 _FP_DIV_MEAT_2_udiv_n_f1 \ 549 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1; \ 550 _FP_DIV_MEAT_2_udiv_n_f0 \ 551 = X##_f0 << (_FP_W_TYPE_SIZE - 1); \ 552 } \ 553 else \ 554 { \ 555 R##_e--; \ 556 _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1; \ 557 _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0; \ 558 _FP_DIV_MEAT_2_udiv_n_f0 = 0; \ 559 } \ 560 \ 561 /* Normalize, i.e. make the most significant bit of the \ 562 denominator set. */ \ 563 _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs); \ 564 \ 565 udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1, \ 566 _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1, \ 567 Y##_f1); \ 568 umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0, \ 569 R##_f1, Y##_f0); \ 570 _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0; \ 571 if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r)) \ 572 { \ 573 R##_f1--; \ 574 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \ 575 _FP_DIV_MEAT_2_udiv_r); \ 576 if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y) \ 577 && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, \ 578 _FP_DIV_MEAT_2_udiv_r)) \ 579 { \ 580 R##_f1--; \ 581 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \ 582 _FP_DIV_MEAT_2_udiv_r); \ 583 } \ 584 } \ 585 _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m); \ 586 \ 587 if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1) \ 588 { \ 589 /* This is a special case, not an optimization \ 590 (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype). \ 591 As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y, \ 592 R##_f0 can be either (UWtype)-1 or (UWtype)-2. But as we \ 593 know what kind of bits it is (sticky, guard, round), \ 594 we don't care. We also don't care what the reminder is, \ 595 because the guard bit will be set anyway. -jj */ \ 596 R##_f0 = -1; \ 597 } \ 598 else \ 599 { \ 600 udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1, \ 601 _FP_DIV_MEAT_2_udiv_r_f1, \ 602 _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1); \ 603 umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, \ 604 _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0); \ 605 _FP_DIV_MEAT_2_udiv_r_f0 = 0; \ 606 if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, \ 607 _FP_DIV_MEAT_2_udiv_r)) \ 608 { \ 609 R##_f0--; \ 610 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \ 611 _FP_DIV_MEAT_2_udiv_r); \ 612 if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y) \ 613 && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, \ 614 _FP_DIV_MEAT_2_udiv_r)) \ 615 { \ 616 R##_f0--; \ 617 _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y, \ 618 _FP_DIV_MEAT_2_udiv_r); \ 619 } \ 620 } \ 621 if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r, \ 622 _FP_DIV_MEAT_2_udiv_m)) \ 623 R##_f0 |= _FP_WORK_STICKY; \ 624 } \ 625 } \ 626 while (0) 627 628 629 /* Square root algorithms: 630 We have just one right now, maybe Newton approximation 631 should be added for those machines where division is fast. */ 632 633 #define _FP_SQRT_MEAT_2(R, S, T, X, q) \ 634 do \ 635 { \ 636 while (q) \ 637 { \ 638 T##_f1 = S##_f1 + (q); \ 639 if (T##_f1 <= X##_f1) \ 640 { \ 641 S##_f1 = T##_f1 + (q); \ 642 X##_f1 -= T##_f1; \ 643 R##_f1 += (q); \ 644 } \ 645 _FP_FRAC_SLL_2 (X, 1); \ 646 (q) >>= 1; \ 647 } \ 648 (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1); \ 649 while ((q) != _FP_WORK_ROUND) \ 650 { \ 651 T##_f0 = S##_f0 + (q); \ 652 T##_f1 = S##_f1; \ 653 if (T##_f1 < X##_f1 \ 654 || (T##_f1 == X##_f1 && T##_f0 <= X##_f0)) \ 655 { \ 656 S##_f0 = T##_f0 + (q); \ 657 S##_f1 += (T##_f0 > S##_f0); \ 658 _FP_FRAC_DEC_2 (X, T); \ 659 R##_f0 += (q); \ 660 } \ 661 _FP_FRAC_SLL_2 (X, 1); \ 662 (q) >>= 1; \ 663 } \ 664 if (X##_f0 | X##_f1) \ 665 { \ 666 if (S##_f1 < X##_f1 \ 667 || (S##_f1 == X##_f1 && S##_f0 < X##_f0)) \ 668 R##_f0 |= _FP_WORK_ROUND; \ 669 R##_f0 |= _FP_WORK_STICKY; \ 670 } \ 671 } \ 672 while (0) 673 674 675 /* Assembly/disassembly for converting to/from integral types. 676 No shifting or overflow handled here. */ 677 678 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize) \ 679 (void) (((rsize) <= _FP_W_TYPE_SIZE) \ 680 ? ({ (r) = X##_f0; }) \ 681 : ({ \ 682 (r) = X##_f1; \ 683 (r) <<= _FP_W_TYPE_SIZE; \ 684 (r) += X##_f0; \ 685 })) 686 687 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize) \ 688 do \ 689 { \ 690 X##_f0 = (r); \ 691 X##_f1 = ((rsize) <= _FP_W_TYPE_SIZE \ 692 ? 0 \ 693 : (r) >> _FP_W_TYPE_SIZE); \ 694 } \ 695 while (0) 696 697 /* Convert FP values between word sizes. */ 698 699 #define _FP_FRAC_COPY_1_2(D, S) (D##_f = S##_f0) 700 701 #define _FP_FRAC_COPY_2_1(D, S) ((D##_f0 = S##_f), (D##_f1 = 0)) 702 703 #define _FP_FRAC_COPY_2_2(D, S) _FP_FRAC_COPY_2 (D, S) 704 705 #endif /* !SOFT_FP_OP_2_H */ 706