1/* 2 * Copyright (c) 2014 Advanced Micro Devices, Inc. 3 * 4 * Copyright (c) 2017 Michal Babej / Tampere University of Technology 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25#define bitalign(hi, lo, shift) \ 26 ((hi) << ((itype)32 - (shift))) | ((lo) >> (shift)); 27 28_CL_OVERLOADABLE void __pocl_fullMulS(vtype *hi, vtype *lo, vtype a, vtype b, vtype bh, vtype bt) 29{ 30 if (HAVE_FMA32) { 31 vtype ph = a * b; 32 *hi = ph; 33 *lo = fma(a, b, -ph); 34 } else { 35 vtype ah = as_vtype(as_utype(a) & (utype)0xfffff000U); 36 vtype at = a - ah; 37 vtype ph = a * b; 38 vtype pt = pocl_fma(at, bt, pocl_fma(at, bh, pocl_fma(ah, bt, pocl_fma(ah, bh, -ph)))); 39 *hi = ph; 40 *lo = pt; 41 } 42} 43 44_CL_OVERLOADABLE vtype __pocl_removePi2S(vtype *hi, vtype *lo, vtype x) 45{ 46 // 72 bits of pi/2 47 const vtype fpiby2_1 = (vtype)( 0xC90FDA / 0x1.0p+23f); 48 const vtype fpiby2_1_h = (vtype)( 0xC90 / 0x1.0p+11f); 49 const vtype fpiby2_1_t = (vtype)( 0xFDA / 0x1.0p+23f); 50 51 const vtype fpiby2_2 = (vtype)( 0xA22168 / 0x1.0p+47f); 52 const vtype fpiby2_2_h = (vtype)( 0xA22 / 0x1.0p+35f); 53 const vtype fpiby2_2_t = (vtype)( 0x168 / 0x1.0p+47f); 54 55 const vtype fpiby2_3 = (vtype)( 0xC234C4 / 0x1.0p+71f); 56 const vtype fpiby2_3_h = (vtype)( 0xC23 / 0x1.0p+59f); 57 const vtype fpiby2_3_t = (vtype)( 0x4C4 / 0x1.0p+71f); 58 59 const vtype twobypi = (vtype)0x1.45f306p-1f; 60 61 vtype fnpi2 = trunc(pocl_fma(x, twobypi, (vtype)0.5f)); 62 63 // subtract n * pi/2 from x 64 vtype rhead, rtail; 65 __pocl_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); 66 vtype v = x - rhead; 67 vtype rem = v + (((x - v) - rhead) - rtail); 68 69 vtype rhead2, rtail2; 70 __pocl_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); 71 v = rem - rhead2; 72 rem = v + (((rem - v) - rhead2) - rtail2); 73 74 vtype rhead3, rtail3; 75 __pocl_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); 76 v = rem - rhead3; 77 78 *hi = v + ((rem - v) - rhead3); 79 *lo = -rtail3; 80 return fnpi2; 81} 82 83_CL_OVERLOADABLE itype __pocl_argReductionSmallS(vtype *r, vtype *rr, vtype x) 84{ 85 vtype fnpi2 = __pocl_removePi2S(r, rr, x); 86 return convert_itype(fnpi2) & (itype)0x3; 87} 88 89#define FULL_MUL(A, B, HI, LO) \ 90 LO = A * B; \ 91 HI = mul_hi(A, B) 92 93#define FULL_MAD(A, B, C, HI, LO) \ 94 LO = ((A) * (B) + (C)); \ 95 HI = mul_hi(A, B); \ 96 HI += ((LO < C) ? (utype)1 : (utype)0) 97 98#ifdef SINGLEVEC 99#define SHIFT_MINUS_32 shift -= c << 5 100#else 101#define SHIFT_MINUS_32 shift -= c & (itype)32 102#endif 103 104_CL_OVERLOADABLE itype __pocl_argReductionLargeS(vtype *r, vtype *rr, vtype x) 105{ 106 itype xe = (itype)(as_itype(x) >> 23) - (itype)127; 107 utype xm = (utype)0x00800000U | (as_utype(x) & (utype)0x7fffffU); 108 109 // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB 110 const utype b6 = (utype)0xA2F9836EU; 111 const utype b5 = (utype)0x4E441529U; 112 const utype b4 = (utype)0xFC2757D1U; 113 const utype b3 = (utype)0xF534DDC0U; 114 const utype b2 = (utype)0xDB629599U; 115 const utype b1 = (utype)0x3C439041U; 116 const utype b0 = (utype)0xFE5163ABU; 117 118 utype p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; 119 120 FULL_MUL(xm, b0, c0, p0); 121 FULL_MAD(xm, b1, c0, c1, p1); 122 FULL_MAD(xm, b2, c1, c0, p2); 123 FULL_MAD(xm, b3, c0, c1, p3); 124 FULL_MAD(xm, b4, c1, c0, p4); 125 FULL_MAD(xm, b5, c0, c1, p5); 126 FULL_MAD(xm, b6, c1, p7, p6); 127 128 itype fbits = (itype)224 + (itype)23 - xe; 129 130 // shift amount to get 2 lsb of integer part at top 2 bits 131 // min: 25 (xe=18) max: 134 (xe=127) 132 itype shift = (itype)254 - fbits; 133 134 // Shift by up to 134/32 = 4 words 135 itype c = (shift > 31); 136 p7 = c ? p6 : p7; 137 p6 = c ? p5 : p6; 138 p5 = c ? p4 : p5; 139 p4 = c ? p3 : p4; 140 p3 = c ? p2 : p3; 141 p2 = c ? p1 : p2; 142 p1 = c ? p0 : p1; 143 SHIFT_MINUS_32; 144 145 c = (shift > 31); 146 p7 = c ? p6 : p7; 147 p6 = c ? p5 : p6; 148 p5 = c ? p4 : p5; 149 p4 = c ? p3 : p4; 150 p3 = c ? p2 : p3; 151 p2 = c ? p1 : p2; 152 SHIFT_MINUS_32; 153 154 c = (shift > 31); 155 p7 = c ? p6 : p7; 156 p6 = c ? p5 : p6; 157 p5 = c ? p4 : p5; 158 p4 = c ? p3 : p4; 159 p3 = c ? p2 : p3; 160 SHIFT_MINUS_32; 161 162 c = (shift > 31); 163 p7 = c ? p6 : p7; 164 p6 = c ? p5 : p6; 165 p5 = c ? p4 : p5; 166 p4 = c ? p3 : p4; 167 SHIFT_MINUS_32; 168 169 // bitalign cannot handle a shift of 32 170 c = (shift > 0); 171 shift = (itype)32 - shift; 172 utype t7 = bitalign(p7, p6, shift); 173 utype t6 = bitalign(p6, p5, shift); 174 utype t5 = bitalign(p5, p4, shift); 175 p7 = c ? t7 : p7; 176 p6 = c ? t6 : p6; 177 p5 = c ? t5 : p5; 178 179 // Get 2 lsb of itype part and msb of fraction 180 itype i = as_itype(p7 >> 29); 181 182 // Scoot up 2 more bits so only fraction remains 183 p7 = bitalign(p7, p6, 30); 184 p6 = bitalign(p6, p5, 30); 185 p5 = bitalign(p5, p4, 30); 186 187 // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 188 utype flip = (i << 31) ? (utype)0xffffffffU : (utype)0U; 189 utype sign = (i << 31) ? (utype)0x80000000U : (utype)0U; 190 p7 = p7 ^ flip; 191 p6 = p6 ^ flip; 192 p5 = p5 ^ flip; 193 194 // Find exponent and shift away leading zeroes and hidden bit 195 xe = as_itype(clz(p7)) + (itype)1; 196 shift = (itype)32 - xe; 197 p7 = bitalign(p7, p6, shift); 198 p6 = bitalign(p6, p5, shift); 199 200 // Most significant part of fraction 201 vtype q1 = as_vtype(as_itype(sign) | (((itype)127 - xe) << 23) | as_itype(p7 >> 9)); 202 203 // Shift out bits we captured on q1 204 p7 = bitalign(p7, p6, 32-23); 205 206 // Get 24 more bits of fraction in another vtype, there are not long strings of zeroes here 207 itype xxe = as_itype(clz(p7)) + (itype)1; 208 p7 = bitalign(p7, p6, (itype)32 - xxe); 209 vtype q0 = as_vtype(as_itype(sign) | (((itype)127 - (xe + (itype)23 + xxe)) << 23) | as_itype(p7 >> 9)); 210 211 // At this point, the fraction q1 + q0 is correct to at least 48 bits 212 // Now we need to multiply the fraction by pi/2 213 // This loses us about 4 bits 214 // pi/2 = C90 FDA A22 168 C23 4C4 215 216 const vtype pio2h = (vtype)(0xc90fda / 0x1.0p+23f); 217 const vtype pio2hh = (vtype)(0xc90 / 0x1.0p+11f); 218 const vtype pio2ht = (vtype)(0xfda / 0x1.0p+23f); 219 const vtype pio2t = (vtype)(0xa22168 / 0x1.0p+47f); 220 221 vtype rh, rt; 222 223 if (HAVE_FMA32) { 224 rh = q1 * pio2h; 225 rt = pocl_fma(q0, pio2h, 226 pocl_fma(q1, pio2t, 227 pocl_fma(q1, pio2h, -rh))); 228 } else { 229 vtype q1h = as_vtype(as_utype(q1) & (utype)0xfffff000); 230 vtype q1t = q1 - q1h; 231 rh = q1 * pio2h; 232 rt = pocl_fma(q1t, pio2ht, 233 pocl_fma(q1t, pio2hh, 234 pocl_fma(q1h, pio2ht, pocl_fma(q1h, pio2hh, -rh)))); 235 rt = pocl_fma(q0, pio2h, pocl_fma(q1, pio2t, rt)); 236 } 237 238 vtype t = rh + rt; 239 rt = rt - (t - rh); 240 241 *r = t; 242 *rr = rt; 243 return ((i >> 1) + (i & (itype)1)) & (itype)0x3; 244} 245 246#undef SHIFT_MINUS_32 247 248_CL_OVERLOADABLE itype __pocl_argReductionS(vtype *r, vtype *rr, vtype x) 249{ 250 itype retval = __pocl_argReductionSmallS(r, rr, x); 251 itype cond = (x >= (vtype)0x1.0p+23f); 252 if (SV_ANY(cond)) { 253 retval = __pocl_argReductionLargeS(r, rr, x); 254 } 255 return retval; 256} 257 258 259 260// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4] 261_CL_OVERLOADABLE v2type __pocl_sincosf_piby4(vtype x) 262{ 263 // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... 264 // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... 265 // = x * f(w) 266 // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... 267 // We use a minimax approximation of (f(w) - 1) / w 268 // because this produces an expansion in even powers of x. 269 270 // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... 271 // = f(w) 272 // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... 273 // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) 274 // because this produces an expansion in even powers of x. 275 276 const vtype sc1 = (vtype)-0.166666666638608441788607926e0F; 277 const vtype sc2 = (vtype)0.833333187633086262120839299e-2F; 278 const vtype sc3 = (vtype)-0.198400874359527693921333720e-3F; 279 const vtype sc4 = (vtype)0.272500015145584081596826911e-5F; 280 281 const vtype cc1 = (vtype)0.41666666664325175238031e-1F; 282 const vtype cc2 = (vtype)-0.13888887673175665567647e-2F; 283 const vtype cc3 = (vtype)0.24800600878112441958053e-4F; 284 const vtype cc4 = (vtype)-0.27301013343179832472841e-6F; 285 286 vtype x2 = x * x; 287 288 v2type ret; 289 ret.lo = pocl_fma(x*x2, 290 pocl_fma(x2, 291 pocl_fma(x2, 292 pocl_fma(x2, sc4, sc3), 293 sc2), 294 sc1), 295 x); 296 ret.hi = pocl_fma(x2*x2, 297 pocl_fma(x2, 298 pocl_fma(x2, 299 pocl_fma(x2, cc4, cc3), 300 cc2), 301 cc1), 302 pocl_fma(x2, (vtype)(-0.5f), (vtype)1.0f)); 303 return ret; 304} 305 306 307_CL_OVERLOADABLE vtype __pocl_cosf_piby4(vtype x, vtype y) { 308 // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... 309 // = f(w) 310 // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... 311 // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) 312 // because this produces an expansion in even powers of x. 313 314 const vtype c1 = (vtype)0.416666666e-1f; 315 const vtype c2 = (vtype)-0.138888876e-2f; 316 const vtype c3 = (vtype)0.248006008e-4f; 317 const vtype c4 = (vtype)-0.2730101334e-6f; 318 const vtype c5 = (vtype)2.0875723372e-09f; // 0x310f74f6 319 const vtype c6 = (vtype)-1.1359647598e-11f; // 0xad47d74e 320 321 vtype z = x * x; 322 vtype r = z * pocl_fma(z, 323 pocl_fma(z, 324 pocl_fma(z, 325 pocl_fma(z, 326 pocl_fma(z, c6, c5), 327 c4), 328 c3), 329 c2), 330 c1); 331 332 // if |x| < 0.3 333 vtype qx = (vtype)0.0f; 334 335 itype ix = as_itype(x) & (itype)EXSIGNBIT_SP32; 336 337 // 0.78125 > |x| >= 0.3 338 vtype xby4 = as_vtype(ix - (itype)0x01000000); 339 qx = ((ix >= (itype)0x3e99999a) & (ix <= (itype)0x3f480000)) ? xby4 : qx; 340 341 // x > 0.78125 342 qx = (ix > (itype)0x3f480000) ? (vtype)0.28125f : qx; 343 344 vtype hz = pocl_fma(z, (vtype)0.5f, -qx); 345 vtype a = (vtype)1.0f - qx; 346 vtype ret = a - (hz - pocl_fma(z, r, -x*y)); 347 return ret; 348} 349 350 351_CL_OVERLOADABLE vtype __pocl_sinf_piby4(vtype x, vtype y) { 352 // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... 353 // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... 354 // = x * f(w) 355 // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... 356 // We use a minimax approximation of (f(w) - 1) / w 357 // because this produces an expansion in even powers of x. 358 359 const vtype c1 = (vtype)-0.1666666666e0f; 360 const vtype c2 = (vtype)0.8333331876e-2f; 361 const vtype c3 = (vtype)-0.198400874e-3f; 362 const vtype c4 = (vtype)0.272500015e-5f; 363 const vtype c5 = (vtype)-2.5050759689e-08f; // 0xb2d72f34 364 const vtype c6 = (vtype)1.5896910177e-10f; // 0x2f2ec9d3 365 366 vtype z = x * x; 367 vtype v = z * x; 368 vtype r = pocl_fma(z, 369 pocl_fma(z, 370 pocl_fma(z, 371 pocl_fma(z, c6, c5), 372 c4), 373 c3), 374 c2); 375 vtype ret = x - pocl_fma(v, -c1, 376 pocl_fma(z, 377 pocl_fma(y, (vtype)0.5f, -v*r), -y)); 378 379 return ret; 380} 381