
/*
 * Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#include <immintrin.h>
#include <common.h>

#define FMA __builtin_fma

#if !(defined _CPU)
#error "Please define _CPU: the CPU-specific suffix appended to function names"
#endif
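/*
 * JOIN2 goes through the _JOIN2 helper so that _CPU is macro-expanded
 * before token pasting; pasting directly would glue the literal token
 * "_CPU" onto the prefix instead of its value.
 */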
#define _JOIN2(a,b) a##b
#define JOIN2(a,b) _JOIN2(a,b)

#define log10_scalar JOIN2(__fd_log10_1_,_CPU)

extern "C" double log10_scalar(double);

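/*
 * Polynomial kernel: the caller decomposes the input as a = m * 2^e with
 * the mantissa m near 1, so that
 *
 *     log10(a) = e * log10(2) + log10(m)
 *
 * log10(m) is approximated by a degree-20 polynomial in (m - 1) with zero
 * constant term (log10(1) = 0). The coefficients c0..c19, LOG10_2, and
 * the other constants are assumed to be supplied by common.h; they are
 * not defined in this file. The eight pairwise terms t0..t7 are combined
 * Estrin-style with the m^2, m^4, and m^8 powers to shorten the FMA
 * dependency chain, while the four low-order coefficients run as a short
 * Horner chain that also folds in the e * log10(2) term.
 */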
static double __attribute__((always_inline)) inline __log10_d_scalar_kernel(double m, double e)
{
    e = e * LOG10_2[0];   // exponent contribution: e * log10(2)
    m = m - 1.0;          // re-center: with m in [0.75, 1.5), m - 1 is in [-0.25, 0.5)

    // Powers of m used by the Estrin combination steps.
    double m2 = m * m;
    double m4 = m2 * m2;
    double m5 = m4 * m;
    double m8 = m4 * m4;

    // Independent first-level pairs: t_i = c_{2i} * m + c_{2i+1}.
    double t0 = FMA(c0[0], m, c1[0]);
    double t1 = FMA(c2[0], m, c3[0]);
    double t2 = FMA(c4[0], m, c5[0]);
    double t3 = FMA(c6[0], m, c7[0]);
    double t4 = FMA(c8[0], m, c9[0]);
    double t5 = FMA(c10[0], m, c11[0]);
    double t6 = FMA(c12[0], m, c13[0]);
    double t7 = FMA(c14[0], m, c15[0]);

    // Low-order tail: Horner over c16..c19, with e * log10(2) folded in.
    double t = c16[0];
    t = FMA(t, m, c17[0]);
    t = FMA(t, m, c18[0]);
    t = FMA(t, m, c19[0]);
    t = FMA(t, m, e);

    // Estrin combination of the pairs: join with m^2, then m^4, then m^8.
    t0 = FMA(t0, m2, t1);
    t2 = FMA(t2, m2, t3);
    t4 = FMA(t4, m2, t5);
    t6 = FMA(t6, m2, t7);
    t0 = FMA(t0, m4, t2);
    t4 = FMA(t4, m4, t6);
    t0 = FMA(t0, m8, t4);

    // High-order part starts at degree 5: result = t0 * m^5 + tail.
    t = FMA(t0, m5, t);

    return t;
}

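/*
 * Scalar log10 entry point. The input is split into mantissa and
 * exponent before calling the kernel. On AVX-512F targets the
 * getmant/getexp instructions perform the split, normalizing the
 * mantissa into [0.75, 1.5) and handling NaN, zero, infinity, and
 * negative inputs through the instruction semantics, so no explicit
 * special-case branch is needed. Elsewhere the split is done with
 * integer bit manipulation, and the special inputs are filtered out
 * by a single unsigned range check.
 */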
double __attribute__((noinline)) log10_scalar(double a_input)
{
    double m, e;

#ifdef __AVX512F__
    // getmant normalizes the mantissa into [0.75, 1.5); with
    // _MM_MANT_SIGN_nan a negative input yields NaN. Subtracting
    // getexp(m) (0 or -1) from getexp(a) recovers e such that a = m * 2^e.
    __m128d va = _mm_set_sd(a_input);
    __m128d vm = _mm_getmant_sd(va, va, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan);
    __m128d ve = _mm_getexp_sd(va, va);
    __m128d vb = _mm_getexp_sd(vm, vm);
    ve = _mm_sub_sd(ve, vb);
    m = _mm_cvtsd_f64(vm);
    e = _mm_cvtsd_f64(ve);
#else
    long long mu, eu;
    // One unsigned compare catches every special input: after subtracting
    // 2^52 (one unit of the exponent field), zeros, denormals, negatives,
    // infinities, and NaNs all land at or above 0x7fe0000000000000.
    unsigned long long u = double_as_ll(a_input);
    u -= 0x10000000000000ULL;
    if (__builtin_expect(u >= 0x7fe0000000000000ULL, 0)) {
        if (a_input != a_input) return a_input + a_input;                    // NaN
        if (a_input < 0.0) return ll_as_double(CANONICAL_NAN[0]);            // negative
        if (a_input == 0.0) return ll_as_double(NINF[0]);                    // zero
        if (double_as_ll(a_input) == PINF[0]) return ll_as_double(PINF[0]);  // +infinity
        // Denormal: scale into the normal range by 2^53, then undo the
        // scaling in the exponent below.
        a_input *= TWO_TO_53;
        mu = double_as_ll(a_input);
        mu -= double_as_ll(THRESHOLD[0]);
        eu = (mu >> 52) - 53;
        mu &= MANTISSA_MASK[0];
        mu += double_as_ll(THRESHOLD[0]);
        m = ll_as_double(mu);
        e = (double)eu;
        return __log10_d_scalar_kernel(m, e);
    }
    // Subtracting the bits of THRESHOLD moves the exponent split to
    // THRESHOLD * 2^e rather than 2^e, so the reconstructed mantissa falls
    // in [THRESHOLD, 2 * THRESHOLD); the exponent bias cancels in the
    // subtraction, so mu >> 52 is the unbiased exponent directly.
    mu = double_as_ll(a_input);
    mu -= double_as_ll(THRESHOLD[0]);
    eu = mu >> 52;
    mu &= MANTISSA_MASK[0];
    mu += double_as_ll(THRESHOLD[0]);
    m = ll_as_double(mu);
    e = (double)eu;
#endif

    return __log10_d_scalar_kernel(m, e);
}
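
/*
 * Usage sketch (illustrative only, not part of this file): the _CPU
 * suffix is injected by the build system, so the exported name below
 * assumes a hypothetical -D_CPU=avx2 build, which expands log10_scalar
 * to __fd_log10_1_avx2.
 *
 *   // test_log10.c -- hypothetical standalone driver
 *   #include <stdio.h>
 *   extern double __fd_log10_1_avx2(double);
 *
 *   int main(void) {
 *       printf("%.17g\n", __fd_log10_1_avx2(1000.0)); // expect ~3.0
 *       printf("%.17g\n", __fd_log10_1_avx2(0.0));    // expect -inf
 *       return 0;
 *   }
 */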