1 /*
2 * Copyright (c) 2012-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18 #define CONCAT_(l,r) l##r
19 #define CONCAT(l,r) CONCAT_(l,r)
20
21 #ifdef TABLE_TARGET
22
23 /* #include <math.h> */
24
25 /* Table elements/constants for exp... */
26
27 static const double table[] = {1.0,
28 1.0218971486541166,
29 1.0442737824274138,
30 1.0671404006768237,
31 1.0905077326652577,
32 1.1143867425958924,
33 1.1387886347566916,
34 1.1637248587775775,
35 1.1892071150027210,
36 1.2152473599804690,
37 1.2418578120734840,
38 1.2690509571917332,
39 1.2968395546510096,
40 1.3252366431597413,
41 1.3542555469368927,
42 1.3839098819638320,
43 1.4142135623730951,
44 1.4451808069770467,
45 1.4768261459394993,
46 1.5091644275934228,
47 1.5422108254079407,
48 1.5759808451078865,
49 1.6104903319492543,
50 1.6457554781539650,
51 1.6817928305074290,
52 1.7186192981224780,
53 1.7562521603732995,
54 1.7947090750031072,
55 1.8340080864093424,
56 1.8741676341103000,
57 1.9152065613971474,
58 1.9571441241754002,
59 2.0};
60
61 static const double invl2x32 = 46.166241308446829; // 32.0/ln(2.0)
62 static const double ln2xinv32 = 0.0216608493924982909; // ln(2.0)/32.0
63 static const double inv32 = 3.125e-2; // 1.0/32.0
64 static const double ln2 = 0.693147180559945309; // ln(2.0)
65 static const double invln2 = 1.44269504088896341; // 1.0/ln(2.0)
66 static const double a1 = 5.0000000000000000e-1; // coefficient for x^2
67 static const double a2 = 1.6666666666526087e-1; // coefficient for x^3
68 static const double a3 = 0.041666666666666667; // coefficient for x^4
69 /*
70 * two is the integer bit pattern for 2.0 in the low half of a 64-bit floating
71 * point
72 * number. muliples of inc added to or subtracted from two manipulates the
73 * exponent
74 * of 2.0, effectively raising 2.0 to a power.
75 */
76
77 static const int two = 1072693248, inc = 1048576;
78 static const int ten23 = 1023;
79 static const long long isign = 0x100000000000000;
80
81 /* Table elements/constants for sincos... */
82
83 static const int ts = 32; /* Maximum index into arraySC */
84
85 /*
86 * Obviously not all these values will be use. Compiler will round them. Numbers
87 * were
88 * generated by Casio High Accuracy calculator at:
89 * http://keisan.casio.com/has10/Free.cgi
90 */
91
92 static const double pi = 3.1415926535897932384626433832795028841971693;
93 static const double pihalves =
94 3.1415926535897932384626433832795028841971693 / 2.0;
95 static const double invpihalves =
96 2.0 / 3.1415926535897932384626433832795028841971693;
97
98 /*
99 *Next constant is the size of the increments for the loopup table
100 */
101
102 static const double angle_inc =
103 3.1415926535897932384626433832795028841971693 / 64.0;
104
105 /*
106 *This is the inverse of the angle increment
107 */
108
109 static const double invangle_inc =
110 64.0 / 3.1415926535897932384626433832795028841971693;
111
112 static const double scArray[] = {
113 // Lookup table for approximating sincos
114 0.000000000000000000, 0.049067674327418015, 0.098017140329560604,
115 0.146730474455361748, 0.195090322016128248, 0.242980179903263871,
116 0.290284677254462331, 0.336889853392220051, 0.382683432365089782,
117 0.427555093430282085, 0.471396736825997642, 0.514102744193221661,
118 0.555570233019602178, 0.595699304492433357, 0.634393284163645488,
119 0.671558954847018330, 0.707106781186547462, 0.740951125354959106,
120 0.773010453362736993, 0.803207531480644832, 0.831469612302545236,
121 0.857728610000272118, 0.881921264348354939, 0.903989293123443227,
122 0.923879532511286738, 0.941544065183020806, 0.956940335732208935,
123 0.970031253194543974, 0.980785280403230431, 0.989176509964780903,
124 0.995184726672196929, 0.998795456205172405, 1.000000000000000000,
125 };
126 #else
127
128 #define fabs(x) __builtin_fabs(x)
129 double __builtin_fabs(double);
130
131 #define cos(x) __builtin_cos(x)
132 double __builtin_cos(double);
133
134 #define sin(x) __builtin_sin(x)
135 double __builtin_sin(double);
136
137 extern double CONCAT(__fsd_exp_,TARGET_VEX_OR_FMA)(double);
138
CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)139 double CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)(double x)
140 {
141
142 /*
143 * This algorithm was orignally developed by Peter Tang. It is based on
144 * the work described in:
145 * P. T. P. Tang. Table driven implementation of the exponential function
146 * in IEEE floating-point arithmetic. ACM Transactions on Mathematical
147 * Software, 15(2):144-157, June 1989
148 *
149 * I have short circuited some of the things Tang did for higher accuracy in
150 * the interest
151 * of performance.
152 * Testing indicates that for degree-2 polynomial, accuracy ~ 2.0x10-7 and for
153 * degree-3 polynomial, accuracy ~4.0x10-10.
154 */
155
156 long long itemp;
157 int m, j;
158 double r, q, s, result;
159 double xprime, temp, tempx;
160 double a2r, a1r, rsq, rcube;
161 union {
162 unsigned long long i;
163 double d;
164 } convert;
165 double half = 0.5;
166 int n, n1, n2;
167 /*
168 * for now we will introduce ntemp;
169 */
170 int ntemp;
171 int addmod = 0;
172 int n1pj;
173 double n1pjln2inv32;
174 tempx = x * invl2x32; // x * 32.0/ln(2) -- scaled x
175 if (fabs(x) > 200.0) // guard against over/underflow
176 return (CONCAT(__fsd_exp_,TARGET_VEX_OR_FMA)(x));
177 if (x < 0.0) { // We need to know if x < 0.0
178 half = -0.5; // For rounding to nearest
179 addmod = 32; // Add to computer % for correct mod
180 }
181 tempx += half; // Assures rounding to nearest
182 n = (int)(tempx); // integer part of tempx rounded to nearest
183 /*
184 * Create our own modulo function
185 *
186 */
187 ntemp = n >> 5; // Divide by 32
188 ntemp = ntemp << 5; // And then multiply by 32
189 j = n - ntemp; // this is n%32 modulo a sign issue
190 n1 = n - j; // this is almost m ...
191 m = n1 >> 5; // ... once we divide by 32
192 n1pj = n1 + j; // add j to n1
193 n1pjln2inv32 = ln2xinv32 * (double)n1pj; // then multiply it by ln(2)/32.0
194 r = x - n1pjln2inv32; // get the remainder
195 convert.i = ten23 + m; // 1023 << 52 = 2.0
196 convert.i = convert.i << 52; // convert.d now has 2^m
197 s = table[j]; // get 2^(j/32)
198 s *= convert.d; // * 2^m
199 /*
200 * We're going to use just a few terms of the expansion.
201 *
202 * p(t) = t + t^2/2.0 + t^3/6.0. We can add 3rd term if needed.
203 *
204 */
205 /*
206 * for degree 3 polynomial,
207 *
208 * q = r * ( 1 + r * ( a1 + a2 * r ) )
209 *
210 */
211
212 q = r * a2;
213 q += a1;
214 q *= r;
215 q += 1.0;
216 q *= r;
217
218 /*
219 * For degree 2 polynomial
220 *
221 * q = r * ( 1 + a1 * r )
222 */
223 /*
224 q = a1 * r;
225 q += 1.0;
226 q *= r;
227 */
228 result = 1.0 + q; // ~exp(r)
229 result *= s; // ~exp(x)
230 return (result);
231 }
232
233 /*
234 * Compute the complex exponential.
235 * We have that exp( x + iy ) = exp( x ) * ( sin( y ) + i cos( y ) )
236 *
237 * For purposes of this code, we have
238 *
239 * exp( z ) = exp( real( z ) + ( sincos( imag( z ) )
240 *
241 */
242
243
244 extern double CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)(double);
245 extern void CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)(double, double *, double *);
246
247 void
CONCAT(__rsz_exp_,TARGET_VEX_OR_FMA)248 CONCAT(__rsz_exp_,TARGET_VEX_OR_FMA)(double *y, double xr, double xi)
249 {
250 double sinvalue, cosvalue;
251 double expvalue;
252
253 expvalue = CONCAT(__rsd_exp_,TARGET_VEX_OR_FMA)(xr); // exp( xr )
254 /* __rsd_sincos_c_fma4( xi, &sinvalue, &cosvalue ); // sincos( xi ) */
255 CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)(xi, &y[1], &y[0]); // sincos( xi )
256 /*
257 y[ 0 ] = expvalue * cosvalue; // exp( xr ) * cos( xi )
258 y[ 1 ] = expvalue * sinvalue; // exp( xr ) * sin( xi )
259 */
260 y[0] = expvalue * y[0]; // exp( xr ) * cos( xi )
261 y[1] = expvalue * y[1]; // exp( xr ) * sin( xi )
262 }
263
264 /*
265 * Because the range of input values for sines and cosines this code has to
266 * solve
267 * is limited to angles between +/- 100, and because the accuracy constraints
268 * are
269 * fairly loose, this program does the reduction phase using the most obvious
270 * method, namely dividing the input angle by pi/2.0. Higher accuracy is not
271 * required.
272 * Once the reduced angle is determined, we need six more steps.
273 * 1. Approximate the sincos for the reduced angle. We do this with a lookup
274 * table.
275 * 2. Subtract the angle for which we have entries in the table from the
276 * reduced angle. This will leave a small residual angle for which the sine
277 * and cosine need to be computed.
278 * 3. Use a minmax polynomial to compute the remainder angle. (See Muller, pages
279 * 58,
280 * 59)
281 * 4. Determine in which quadrant the angle resides to properly assine signs and
282 * whether, for instance, and then decide how properly to use the information
283 * in step 5. A modulo operation determines this.
284 * 5.. Use the trig identities for computing sines and cosines for sums of
285 * angles:
286 * sin( a + b ) = sin( a ) * cos( b ) + cos( a ) * sin( b )
287 * cos( a + b ) = cos( a ) * cos( b ) - sin( a ) * sin( b )
288 * 6. Correct the sign for the sine depending on whether x >= 0 or x < 0.
289 */
290
291 /*
292 * The next two coeffiecients used for low order sincos approximation
293 */
294
295 #define S33 -0.1666596 /* Multiplies x^3 for sin */
296 #define S22 -0.49996629 /* Multiples x^2 for cos */
297
298 void
CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)299 CONCAT(__rsd_sincos_c_,TARGET_VEX_OR_FMA)(double x, double *s, double *c)
300 {
301 double kpihalves; // pihalves * k
302 int k; // x/pihalves
303 int sindex, cindex; // indices into table
304 double tsin, tcos; // sine and cosine from table
305 double tsinss, tsinsc, tcosss, tcossc; // products of step 5 above
306 double mysin, mycos; // final sine and cosine
307 double x2, x3; // remains**2 and remains * x2
308 double smallsin, smallcos; // sine and cosine of remainder
309 double x2s22, x3s33; // x2 * s22 and x3 * s33
310 double remainder, remains; // used to compute x - k * pi/2
311 double sign; // sign( x )
312 double fabsx; // absolute value of x
313 double r; // temporary storage for remainder
314 int mod4; // This is used to compute k%4
315
316 fabsx = fabs(x); // Need |x|
317 if (fabsx < 100.0) { // We'll work only on "small" angles
318 sign = 1.0 - 2.0 * (x < 0.0); // Faster than test and branch
319 k = fabsx * invpihalves; // How many 1/2 rotations in |x|
320 kpihalves = (double)k * pihalves; // now we should be at k * pi/2
321 remainder = fabsx - kpihalves; // x - k * pi/2
322 sindex = remainder * invangle_inc; // generate index into sine table
323 cindex = ts - sindex; // generate index inot cosine table
324 tsin = scArray[sindex]; // fetch sine from table
325 tcos = scArray[cindex]; // fetch cosine from table
326 r = (double)sindex * angle_inc; // Need x - k * pi/2 - small-angle
327 remains = remainder - r; // sin( remains ) has to be computed
328 x2 = remains * remains; // x'^2 starts polynomial expansion
329 x3 = x2 * remains; // x'^3
330 x3s33 = x3 * S33; // x'^3 * S33
331 x2s22 = x2 * S22; // x'^2 * S22
332 smallsin = x3s33 + remains; // ~sin( x' )
333 smallcos = x2s22 + 1.0; // ~cos( x' )
334 tsinss = tsin * smallsin; // part of sin( a + b )
335 tsinsc = tsin * smallcos; // " "
336 tcosss = tcos * smallsin; // " "
337 tcossc = tcos * smallcos; // " "
338 mod4 = k >> 2; // Computing mod with shifts
339 mod4 = mod4 << 2; // for k%4 is faster than k%4
340 mod4 = k - mod4; //
341
342 switch (mod4) { // mod4 selects the quadrant to ..
343 case 0: // .. properly compute sincos()
344 mysin = tsinsc + tcosss;
345 mysin = sign * mysin;
346 mycos = tcossc - tsinss;
347 *s = mysin;
348 *c = mycos;
349 break;
350 case 1:
351 mysin = tcossc - tsinss;
352 mysin = sign * mysin;
353 mycos = -(tsinsc + tcosss);
354 *s = mysin;
355 *c = mycos;
356 break;
357 case 2:
358 mysin = -(tsinsc + tcosss);
359 mysin = sign * mysin;
360 mycos = tsinss - tcossc;
361 *s = mysin;
362 *c = mycos;
363 break;
364 case 3:
365 mysin = tsinss - tcossc;
366 mysin = sign * mysin;
367 mycos = tsinsc + tcosss;
368 *s = mysin;
369 *c = mycos;
370 }
371 } else {
372 *s = sin(x); // We go here if |x| > 100.0
373 *c = cos(x);
374 }
375 }
376
377 #endif
378