1/*
2 * Copyright (c) 2014 Advanced Micro Devices, Inc.
3 *
4 * Copyright (c) 2017 Michal Babej / Tampere University of Technology
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#define bitalign(hi, lo, shift) \
26  ((hi) << ((itype)32 - (shift))) | ((lo) >> (shift));
27
28_CL_OVERLOADABLE void __pocl_fullMulS(vtype *hi, vtype *lo, vtype a, vtype b, vtype bh, vtype bt)
29{
30    if (HAVE_FMA32) {
31        vtype ph = a * b;
32        *hi = ph;
33        *lo = fma(a, b, -ph);
34    } else {
35        vtype ah = as_vtype(as_utype(a) & (utype)0xfffff000U);
36        vtype at = a - ah;
37        vtype ph = a * b;
38        vtype pt = pocl_fma(at, bt, pocl_fma(at, bh, pocl_fma(ah, bt, pocl_fma(ah, bh, -ph))));
39        *hi = ph;
40        *lo = pt;
41    }
42}
43
44_CL_OVERLOADABLE vtype __pocl_removePi2S(vtype *hi, vtype *lo, vtype x)
45{
46    // 72 bits of pi/2
47    const vtype fpiby2_1 = (vtype)( 0xC90FDA / 0x1.0p+23f);
48    const vtype fpiby2_1_h = (vtype)( 0xC90 / 0x1.0p+11f);
49    const vtype fpiby2_1_t = (vtype)( 0xFDA / 0x1.0p+23f);
50
51    const vtype fpiby2_2 = (vtype)( 0xA22168 / 0x1.0p+47f);
52    const vtype fpiby2_2_h = (vtype)( 0xA22 / 0x1.0p+35f);
53    const vtype fpiby2_2_t = (vtype)( 0x168 / 0x1.0p+47f);
54
55    const vtype fpiby2_3 = (vtype)( 0xC234C4 / 0x1.0p+71f);
56    const vtype fpiby2_3_h = (vtype)( 0xC23 / 0x1.0p+59f);
57    const vtype fpiby2_3_t = (vtype)( 0x4C4 / 0x1.0p+71f);
58
59    const vtype twobypi = (vtype)0x1.45f306p-1f;
60
61    vtype fnpi2 = trunc(pocl_fma(x, twobypi, (vtype)0.5f));
62
63    // subtract n * pi/2 from x
64    vtype rhead, rtail;
65    __pocl_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
66    vtype v = x - rhead;
67    vtype rem = v + (((x - v) - rhead) - rtail);
68
69    vtype rhead2, rtail2;
70    __pocl_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
71    v = rem - rhead2;
72    rem = v + (((rem - v) - rhead2) - rtail2);
73
74    vtype rhead3, rtail3;
75    __pocl_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
76    v = rem - rhead3;
77
78    *hi = v + ((rem - v) - rhead3);
79    *lo = -rtail3;
80    return fnpi2;
81}
82
83_CL_OVERLOADABLE itype __pocl_argReductionSmallS(vtype *r, vtype *rr, vtype x)
84{
85    vtype fnpi2 = __pocl_removePi2S(r, rr, x);
86    return convert_itype(fnpi2) & (itype)0x3;
87}
88
89#define FULL_MUL(A, B, HI, LO) \
90    LO = A * B; \
91    HI = mul_hi(A, B)
92
93#define FULL_MAD(A, B, C, HI, LO) \
94    LO = ((A) * (B) + (C)); \
95    HI = mul_hi(A, B); \
96    HI += ((LO < C) ? (utype)1 : (utype)0)
97
98#ifdef SINGLEVEC
99#define SHIFT_MINUS_32 shift -= c << 5
100#else
101#define SHIFT_MINUS_32 shift -= c & (itype)32
102#endif
103
104_CL_OVERLOADABLE itype __pocl_argReductionLargeS(vtype *r, vtype *rr, vtype x)
105{
106    itype xe = (itype)(as_itype(x) >> 23) - (itype)127;
107    utype xm = (utype)0x00800000U | (as_utype(x) & (utype)0x7fffffU);
108
109    // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
110    const utype b6 = (utype)0xA2F9836EU;
111    const utype b5 = (utype)0x4E441529U;
112    const utype b4 = (utype)0xFC2757D1U;
113    const utype b3 = (utype)0xF534DDC0U;
114    const utype b2 = (utype)0xDB629599U;
115    const utype b1 = (utype)0x3C439041U;
116    const utype b0 = (utype)0xFE5163ABU;
117
118    utype p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
119
120    FULL_MUL(xm, b0, c0, p0);
121    FULL_MAD(xm, b1, c0, c1, p1);
122    FULL_MAD(xm, b2, c1, c0, p2);
123    FULL_MAD(xm, b3, c0, c1, p3);
124    FULL_MAD(xm, b4, c1, c0, p4);
125    FULL_MAD(xm, b5, c0, c1, p5);
126    FULL_MAD(xm, b6, c1, p7, p6);
127
128    itype fbits = (itype)224 + (itype)23 - xe;
129
130    // shift amount to get 2 lsb of integer part at top 2 bits
131    //   min: 25 (xe=18) max: 134 (xe=127)
132    itype shift = (itype)254 - fbits;
133
134    // Shift by up to 134/32 = 4 words
135    itype c = (shift > 31);
136    p7 = c ? p6 : p7;
137    p6 = c ? p5 : p6;
138    p5 = c ? p4 : p5;
139    p4 = c ? p3 : p4;
140    p3 = c ? p2 : p3;
141    p2 = c ? p1 : p2;
142    p1 = c ? p0 : p1;
143    SHIFT_MINUS_32;
144
145    c = (shift > 31);
146    p7 = c ? p6 : p7;
147    p6 = c ? p5 : p6;
148    p5 = c ? p4 : p5;
149    p4 = c ? p3 : p4;
150    p3 = c ? p2 : p3;
151    p2 = c ? p1 : p2;
152    SHIFT_MINUS_32;
153
154    c = (shift > 31);
155    p7 = c ? p6 : p7;
156    p6 = c ? p5 : p6;
157    p5 = c ? p4 : p5;
158    p4 = c ? p3 : p4;
159    p3 = c ? p2 : p3;
160    SHIFT_MINUS_32;
161
162    c = (shift > 31);
163    p7 = c ? p6 : p7;
164    p6 = c ? p5 : p6;
165    p5 = c ? p4 : p5;
166    p4 = c ? p3 : p4;
167    SHIFT_MINUS_32;
168
169    // bitalign cannot handle a shift of 32
170    c = (shift > 0);
171    shift = (itype)32 - shift;
172    utype t7 = bitalign(p7, p6, shift);
173    utype t6 = bitalign(p6, p5, shift);
174    utype t5 = bitalign(p5, p4, shift);
175    p7 = c ? t7 : p7;
176    p6 = c ? t6 : p6;
177    p5 = c ? t5 : p5;
178
179    // Get 2 lsb of itype part and msb of fraction
180    itype i = as_itype(p7 >> 29);
181
182    // Scoot up 2 more bits so only fraction remains
183    p7 = bitalign(p7, p6, 30);
184    p6 = bitalign(p6, p5, 30);
185    p5 = bitalign(p5, p4, 30);
186
187    // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
188    utype flip = (i << 31) ? (utype)0xffffffffU : (utype)0U;
189    utype sign = (i << 31) ? (utype)0x80000000U : (utype)0U;
190    p7 = p7 ^ flip;
191    p6 = p6 ^ flip;
192    p5 = p5 ^ flip;
193
194    // Find exponent and shift away leading zeroes and hidden bit
195    xe = as_itype(clz(p7)) + (itype)1;
196    shift = (itype)32 - xe;
197    p7 = bitalign(p7, p6, shift);
198    p6 = bitalign(p6, p5, shift);
199
200    // Most significant part of fraction
201    vtype q1 = as_vtype(as_itype(sign) | (((itype)127 - xe) << 23) | as_itype(p7 >> 9));
202
203    // Shift out bits we captured on q1
204    p7 = bitalign(p7, p6, 32-23);
205
206    // Get 24 more bits of fraction in another vtype, there are not long strings of zeroes here
207    itype xxe = as_itype(clz(p7)) + (itype)1;
208    p7 = bitalign(p7, p6, (itype)32 - xxe);
209    vtype q0 = as_vtype(as_itype(sign) | (((itype)127 - (xe + (itype)23 + xxe)) << 23) | as_itype(p7 >> 9));
210
211    // At this point, the fraction q1 + q0 is correct to at least 48 bits
212    // Now we need to multiply the fraction by pi/2
213    // This loses us about 4 bits
214    // pi/2 = C90 FDA A22 168 C23 4C4
215
216    const vtype pio2h = (vtype)(0xc90fda / 0x1.0p+23f);
217    const vtype pio2hh = (vtype)(0xc90 / 0x1.0p+11f);
218    const vtype pio2ht = (vtype)(0xfda / 0x1.0p+23f);
219    const vtype pio2t = (vtype)(0xa22168 / 0x1.0p+47f);
220
221    vtype rh, rt;
222
223    if (HAVE_FMA32) {
224        rh = q1 * pio2h;
225        rt = pocl_fma(q0, pio2h,
226               pocl_fma(q1, pio2t,
227                 pocl_fma(q1, pio2h, -rh)));
228    } else {
229        vtype q1h = as_vtype(as_utype(q1) & (utype)0xfffff000);
230        vtype q1t = q1 - q1h;
231        rh = q1 * pio2h;
232        rt = pocl_fma(q1t, pio2ht,
233               pocl_fma(q1t, pio2hh,
234                 pocl_fma(q1h, pio2ht, pocl_fma(q1h, pio2hh, -rh))));
235        rt = pocl_fma(q0, pio2h, pocl_fma(q1, pio2t, rt));
236    }
237
238    vtype t = rh + rt;
239    rt = rt - (t - rh);
240
241    *r = t;
242    *rr = rt;
243    return ((i >> 1) + (i & (itype)1)) & (itype)0x3;
244}
245
246#undef SHIFT_MINUS_32
247
248_CL_OVERLOADABLE itype __pocl_argReductionS(vtype *r, vtype *rr, vtype x)
249{
250    itype retval = __pocl_argReductionSmallS(r, rr, x);
251    itype cond = (x >= (vtype)0x1.0p+23f);
252    if (SV_ANY(cond)) {
253        retval = __pocl_argReductionLargeS(r, rr, x);
254    }
255    return retval;
256}
257
258
259
260// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4]
261_CL_OVERLOADABLE v2type __pocl_sincosf_piby4(vtype x)
262{
263    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
264    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
265    // = x * f(w)
266    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
267    // We use a minimax approximation of (f(w) - 1) / w
268    // because this produces an expansion in even powers of x.
269
270    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
271    // = f(w)
272    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
273    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
274    // because this produces an expansion in even powers of x.
275
276    const vtype sc1 = (vtype)-0.166666666638608441788607926e0F;
277    const vtype sc2 = (vtype)0.833333187633086262120839299e-2F;
278    const vtype sc3 = (vtype)-0.198400874359527693921333720e-3F;
279    const vtype sc4 = (vtype)0.272500015145584081596826911e-5F;
280
281    const vtype cc1 = (vtype)0.41666666664325175238031e-1F;
282    const vtype cc2 = (vtype)-0.13888887673175665567647e-2F;
283    const vtype cc3 = (vtype)0.24800600878112441958053e-4F;
284    const vtype cc4 = (vtype)-0.27301013343179832472841e-6F;
285
286    vtype x2 = x * x;
287
288    v2type ret;
289    ret.lo = pocl_fma(x*x2,
290               pocl_fma(x2,
291                 pocl_fma(x2,
292                   pocl_fma(x2, sc4, sc3),
293                   sc2),
294                 sc1),
295               x);
296    ret.hi = pocl_fma(x2*x2,
297               pocl_fma(x2,
298                 pocl_fma(x2,
299                   pocl_fma(x2, cc4, cc3),
300                   cc2),
301                 cc1),
302                 pocl_fma(x2, (vtype)(-0.5f), (vtype)1.0f));
303    return ret;
304}
305
306
307_CL_OVERLOADABLE vtype __pocl_cosf_piby4(vtype x, vtype y) {
308    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
309    // = f(w)
310    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
311    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
312    // because this produces an expansion in even powers of x.
313
314    const vtype c1 = (vtype)0.416666666e-1f;
315    const vtype c2 = (vtype)-0.138888876e-2f;
316    const vtype c3 = (vtype)0.248006008e-4f;
317    const vtype c4 = (vtype)-0.2730101334e-6f;
318    const vtype c5 = (vtype)2.0875723372e-09f;  // 0x310f74f6
319    const vtype c6 = (vtype)-1.1359647598e-11f; // 0xad47d74e
320
321    vtype z = x * x;
322    vtype r = z * pocl_fma(z,
323                    pocl_fma(z,
324                      pocl_fma(z,
325                        pocl_fma(z,
326                          pocl_fma(z, c6,  c5),
327                          c4),
328                        c3),
329                      c2),
330                    c1);
331
332    // if |x| < 0.3
333    vtype qx = (vtype)0.0f;
334
335    itype ix = as_itype(x) & (itype)EXSIGNBIT_SP32;
336
337    //  0.78125 > |x| >= 0.3
338    vtype xby4 = as_vtype(ix - (itype)0x01000000);
339    qx = ((ix >= (itype)0x3e99999a) & (ix <= (itype)0x3f480000)) ? xby4 : qx;
340
341    // x > 0.78125
342    qx = (ix > (itype)0x3f480000) ? (vtype)0.28125f : qx;
343
344    vtype hz = pocl_fma(z, (vtype)0.5f, -qx);
345    vtype a = (vtype)1.0f - qx;
346    vtype ret = a - (hz - pocl_fma(z, r, -x*y));
347    return ret;
348}
349
350
351_CL_OVERLOADABLE vtype __pocl_sinf_piby4(vtype x, vtype y) {
352    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
353    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
354    // = x * f(w)
355    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
356    // We use a minimax approximation of (f(w) - 1) / w
357    // because this produces an expansion in even powers of x.
358
359    const vtype c1 = (vtype)-0.1666666666e0f;
360    const vtype c2 = (vtype)0.8333331876e-2f;
361    const vtype c3 = (vtype)-0.198400874e-3f;
362    const vtype c4 = (vtype)0.272500015e-5f;
363    const vtype c5 = (vtype)-2.5050759689e-08f; // 0xb2d72f34
364    const vtype c6 = (vtype)1.5896910177e-10f;  // 0x2f2ec9d3
365
366    vtype z = x * x;
367    vtype v = z * x;
368    vtype r = pocl_fma(z,
369                pocl_fma(z,
370                  pocl_fma(z,
371                    pocl_fma(z, c6, c5),
372                    c4),
373                  c3),
374                c2);
375    vtype ret = x - pocl_fma(v, -c1,
376                      pocl_fma(z,
377                        pocl_fma(y, (vtype)0.5f, -v*r), -y));
378
379    return ret;
380}
381