1 /*
2  * Copyright (c) 2012-2018, NVIDIA CORPORATION.  All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17 
18 #define	CONCAT_(l,r)	l##r
19 #define	CONCAT(l,r)	CONCAT_(l,r)
20 
21 #ifdef TABLE_TARGET
22 
23 /*  #include <math.h>  */
24 
25 /* Table elements/constants for exp... */
26 
27 static const double table[] = {1.0,
28                                1.0218971486541166,
29                                1.0442737824274138,
30                                1.0671404006768237,
31                                1.0905077326652577,
32                                1.1143867425958924,
33                                1.1387886347566916,
34                                1.1637248587775775,
35                                1.1892071150027210,
36                                1.2152473599804690,
37                                1.2418578120734840,
38                                1.2690509571917332,
39                                1.2968395546510096,
40                                1.3252366431597413,
41                                1.3542555469368927,
42                                1.3839098819638320,
43                                1.4142135623730951,
44                                1.4451808069770467,
45                                1.4768261459394993,
46                                1.5091644275934228,
47                                1.5422108254079407,
48                                1.5759808451078865,
49                                1.6104903319492543,
50                                1.6457554781539650,
51                                1.6817928305074290,
52                                1.7186192981224780,
53                                1.7562521603732995,
54                                1.7947090750031072,
55                                1.8340080864093424,
56                                1.8741676341103000,
57                                1.9152065613971474,
58                                1.9571441241754002,
59                                2.0};
60 
61 static const double invl2x32 = 46.166241308446829;     // 32.0/ln(2.0)
62 static const double ln2xinv32 = 0.0216608493924982909; // ln(2.0)/32.0
63 static const double inv32 = 3.125e-2;                  // 1.0/32.0
64 static const double ln2 = 0.693147180559945309;        // ln(2.0)
65 static const double invln2 = 1.44269504088896341;      // 1.0/ln(2.0)
66 static const double a1 = 5.0000000000000000e-1;        // coefficient for x^2
67 static const double a2 = 1.6666666666526087e-1;        // coefficient for x^3
68 static const double a3 = 0.041666666666666667;         // coefficient for x^4
69                                                        /*
70                                                         * two is the integer bit pattern for 2.0 in the low half of a 64-bit floating
71                                                         * point
72                                                         * number. muliples of inc added to or subtracted from two manipulates the
73                                                         * exponent
74                                                         * of 2.0, effectively raising 2.0 to a power.
75                                                         */
76 
77 static const int two = 1072693248, inc = 1048576;
78 static const int ten23 = 1023;
79 static const long long isign = 0x100000000000000;
80 
81 /* Table elements/constants for sincos... */
82 
83 static const int ts = 32; /* Maximum index into arraySC */
84 
85 /*
86  * Obviously not all these values will be use. Compiler will round them. Numbers
87  * were
88  * generated by Casio High Accuracy calculator at:
89  * http://keisan.casio.com/has10/Free.cgi
90  */
91 
92 static const double pi = 3.1415926535897932384626433832795028841971693;
93 static const double pihalves =
94     3.1415926535897932384626433832795028841971693 / 2.0;
95 static const double invpihalves =
96     2.0 / 3.1415926535897932384626433832795028841971693;
97 
98 /*
99  *Next constant is the size of the increments for the loopup table
100  */
101 
102 static const double angle_inc =
103     3.1415926535897932384626433832795028841971693 / 64.0;
104 
105 /*
106  *This is the inverse of the angle increment
107  */
108 
109 static const double invangle_inc =
110     64.0 / 3.1415926535897932384626433832795028841971693;
111 
112 static const double scArray[] = {
113     // Lookup table for approximating sincos
114     0.000000000000000000, 0.049067674327418015, 0.098017140329560604,
115     0.146730474455361748, 0.195090322016128248, 0.242980179903263871,
116     0.290284677254462331, 0.336889853392220051, 0.382683432365089782,
117     0.427555093430282085, 0.471396736825997642, 0.514102744193221661,
118     0.555570233019602178, 0.595699304492433357, 0.634393284163645488,
119     0.671558954847018330, 0.707106781186547462, 0.740951125354959106,
120     0.773010453362736993, 0.803207531480644832, 0.831469612302545236,
121     0.857728610000272118, 0.881921264348354939, 0.903989293123443227,
122     0.923879532511286738, 0.941544065183020806, 0.956940335732208935,
123     0.970031253194543974, 0.980785280403230431, 0.989176509964780903,
124     0.995184726672196929, 0.998795456205172405, 1.000000000000000000,
125 };
126 #else
127 
128 #define fabs(x) __builtin_fabs(x)
129 double __builtin_fabs(double);
130 
131 #define cos(x) __builtin_cos(x)
132 double __builtin_cos(double);
133 
134 #define sin(x) __builtin_sin(x)
135 double __builtin_sin(double);
136 
137 extern	double CONCAT(__fsd_exp_,TARGET_VEX_OR_FMA)(double);
138 
CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)139 double	CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)(double x)
140 {
141 
142   /*
143    * This algorithm was orignally developed by Peter Tang. It is based on
144    * the work described in:
145    * P. T. P. Tang. Table driven implementation of the exponential function
146    * in IEEE floating-point arithmetic. ACM Transactions on Mathematical
147    * Software, 15(2):144-157, June 1989
148    *
149    * I have short circuited some of the things Tang did for higher accuracy in
150    * the interest
151    * of performance.
152    * Testing indicates that for degree-2 polynomial, accuracy ~ 2.0x10-7 and for
153    * degree-3 polynomial, accuracy ~4.0x10-10.
154    */
155 
156   long long itemp;
157   int m, j;
158   double r, q, s, result;
159   double xprime, temp, tempx;
160   double a2r, a1r, rsq, rcube;
161   union {
162     unsigned long long i;
163     double d;
164   } convert;
165   double half = 0.5;
166   int n, n1, n2;
167   /*
168    * for now we will introduce ntemp;
169    */
170   int ntemp;
171   int addmod = 0;
172   int n1pj;
173   double n1pjln2inv32;
174   tempx = x * invl2x32; // x * 32.0/ln(2) -- scaled x
175   if (fabs(x) > 200.0)  // guard against over/underflow
176     return (CONCAT(__fsd_exp_,TARGET_VEX_OR_FMA)(x));
177   if (x < 0.0) {                // We need to know if x < 0.0
178     half = -0.5;                // For rounding to nearest
179     addmod = 32;                // Add to computer % for correct mod
180   }
181   tempx += half;      // Assures rounding to nearest
182   n = (int)(tempx);   // integer part of tempx rounded to nearest
183                       /*
184                        * Create our own modulo function
185                        *
186                        */
187   ntemp = n >> 5;     // Divide by 32
188   ntemp = ntemp << 5; // And then multiply by 32
189   j = n - ntemp;      // this is n%32 modulo a sign issue
190   n1 = n - j;         // this is almost m ...
191   m = n1 >> 5;        // ... once we divide by 32
192   n1pj = n1 + j;      // add j to n1
193   n1pjln2inv32 = ln2xinv32 * (double)n1pj; // then multiply it by ln(2)/32.0
194   r = x - n1pjln2inv32;                    // get the remainder
195   convert.i = ten23 + m;                   // 1023 << 52 = 2.0
196   convert.i = convert.i << 52;             // convert.d now has 2^m
197   s = table[j];                            // get 2^(j/32)
198   s *= convert.d;                          // * 2^m
199                                            /*
200                                             * We're going to use just a few terms of the expansion.
201                                             *
202                                             * p(t) = t + t^2/2.0 + t^3/6.0. We can add 3rd term if needed.
203                                             *
204                                             */
205   /*
206    * for degree 3 polynomial,
207    *
208    * q =  r * ( 1 + r * ( a1 + a2 * r ) )
209    *
210    */
211 
212   q = r * a2;
213   q += a1;
214   q *= r;
215   q += 1.0;
216   q *= r;
217 
218   /*
219    * For degree 2 polynomial
220    *
221    * q = r * ( 1 + a1 * r )
222    */
223   /*
224     q = a1 * r;
225     q += 1.0;
226     q *= r;
227   */
228   result = 1.0 + q; // ~exp(r)
229   result *= s;      // ~exp(x)
230   return (result);
231 }
232 
233 /*
234  * Compute the complex exponential.
235  * We have that exp( x + iy ) = exp( x ) * ( sin( y ) + i cos( y ) )
236  *
237  * For purposes of this code, we have
238  *
239  * exp( z ) = exp( real( z ) + ( sincos( imag( z ) )
240  *
241 */
242 
243 
244 extern	double	CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)(double);
245 extern	void	CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)(double, double *, double *);
246 
247 void
CONCAT(__rsz_exp_,TARGET_VEX_OR_FMA)248 CONCAT(__rsz_exp_,TARGET_VEX_OR_FMA)(double *y, double xr, double xi)
249 {
250   double sinvalue, cosvalue;
251   double expvalue;
252 
253   expvalue = CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)(xr); // exp( xr )
254   /*  __rsd_sincos_c_fma4( xi, &sinvalue, &cosvalue ); // sincos( xi ) */
255   CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)(xi, &y[1], &y[0]); // sincos( xi )
256                                          /*
257                                            y[ 0 ] = expvalue * cosvalue;         // exp( xr ) * cos( xi )
258                                            y[ 1 ] = expvalue * sinvalue;         // exp( xr ) * sin( xi )
259                                          */
260   y[0] = expvalue * y[0];                // exp( xr ) * cos( xi )
261   y[1] = expvalue * y[1];                // exp( xr ) * sin( xi )
262 }
263 
264 /*
265  * Because the range of input values for sines and cosines this code has to
266  * solve
267  * is limited to angles between +/- 100, and because the accuracy constraints
268  * are
269  * fairly loose, this program does the reduction phase using the most obvious
270  * method, namely dividing the input angle by pi/2.0. Higher accuracy is not
271  * required.
272  * Once the reduced angle is determined, we need six more steps.
273  * 1. Approximate the sincos for the reduced angle. We do this with a lookup
274  * table.
275  * 2. Subtract the angle for which we have entries in the table from the
276  *    reduced angle. This will leave a small residual angle for which the sine
277  *    and cosine need to be computed.
278  * 3. Use a minmax polynomial to compute the remainder angle. (See Muller, pages
279  * 58,
280  *    59)
281  * 4. Determine in which quadrant the angle resides to properly assine signs and
282  *    whether, for instance, and then decide how properly to use the information
283  *    in step 5. A modulo operation determines this.
284  * 5.. Use the trig identities for computing sines and cosines for sums of
285  * angles:
286  *    sin( a + b ) = sin( a ) * cos( b ) + cos( a ) * sin( b )
287  *    cos( a + b ) = cos( a ) * cos( b ) - sin( a ) * sin( b )
288  * 6. Correct the sign for the sine depending on whether x >= 0 or x < 0.
289  */
290 
291 /*
292  * The next two coeffiecients used for low order sincos approximation
293  */
294 
295 #define S33 -0.1666596  /* Multiplies x^3 for sin */
296 #define S22 -0.49996629 /* Multiples x^2 for cos */
297 
298 void
CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)299 CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)(double x, double *s, double *c)
300 {
301   double kpihalves;                      // pihalves * k
302   int k;                                 // x/pihalves
303   int sindex, cindex;                    // indices into table
304   double tsin, tcos;                     // sine and cosine from table
305   double tsinss, tsinsc, tcosss, tcossc; // products of step 5 above
306   double mysin, mycos;                   // final sine and cosine
307   double x2, x3;                         // remains**2 and remains * x2
308   double smallsin, smallcos;             // sine and cosine of remainder
309   double x2s22, x3s33;                   // x2 * s22 and x3 * s33
310   double remainder, remains;             // used to compute x - k * pi/2
311   double sign;                           // sign( x )
312   double fabsx;                          // absolute value of x
313   double r;                              // temporary storage for remainder
314   int mod4;                              // This is used to compute k%4
315 
316   fabsx = fabs(x);                     // Need |x|
317   if (fabsx < 100.0) {                 // We'll work only on "small" angles
318     sign = 1.0 - 2.0 * (x < 0.0);      // Faster than test and branch
319     k = fabsx * invpihalves;           // How many 1/2 rotations in |x|
320     kpihalves = (double)k * pihalves;  // now we should be at k * pi/2
321     remainder = fabsx - kpihalves;     // x - k * pi/2
322     sindex = remainder * invangle_inc; // generate index into sine table
323     cindex = ts - sindex;              // generate index inot cosine table
324     tsin = scArray[sindex];            // fetch sine from table
325     tcos = scArray[cindex];            // fetch cosine from table
326     r = (double)sindex * angle_inc;    // Need x - k * pi/2 - small-angle
327     remains = remainder - r;           // sin( remains ) has to be computed
328     x2 = remains * remains;            // x'^2 starts polynomial expansion
329     x3 = x2 * remains;                 // x'^3
330     x3s33 = x3 * S33;                  // x'^3 * S33
331     x2s22 = x2 * S22;                  // x'^2 * S22
332     smallsin = x3s33 + remains;        // ~sin( x' )
333     smallcos = x2s22 + 1.0;            // ~cos( x' )
334     tsinss = tsin * smallsin;          // part of sin( a + b )
335     tsinsc = tsin * smallcos;          // "                  "
336     tcosss = tcos * smallsin;          // "                  "
337     tcossc = tcos * smallcos;          // "                  "
338     mod4 = k >> 2;                     // Computing mod with shifts
339     mod4 = mod4 << 2;                  // for k%4 is faster than k%4
340     mod4 = k - mod4;                   //
341 
342     switch (mod4) { // mod4 selects the quadrant to ..
343     case 0:         // .. properly compute sincos()
344       mysin = tsinsc + tcosss;
345       mysin = sign * mysin;
346       mycos = tcossc - tsinss;
347       *s = mysin;
348       *c = mycos;
349       break;
350     case 1:
351       mysin = tcossc - tsinss;
352       mysin = sign * mysin;
353       mycos = -(tsinsc + tcosss);
354       *s = mysin;
355       *c = mycos;
356       break;
357     case 2:
358       mysin = -(tsinsc + tcosss);
359       mysin = sign * mysin;
360       mycos = tsinss - tcossc;
361       *s = mysin;
362       *c = mycos;
363       break;
364     case 3:
365       mysin = tsinss - tcossc;
366       mysin = sign * mysin;
367       mycos = tsinsc + tcosss;
368       *s = mysin;
369       *c = mycos;
370     }
371   } else {
372     *s = sin(x); // We go here if |x| > 100.0
373     *c = cos(x);
374   }
375 }
376 
377 #endif
378