1 
2 /*
3  * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 
20 #ifndef __TAN_F_SCALAR_H__
21 #define __TAN_F_SCALAR_H__
22 
23 
24 #include <assert.h>
25 #include <stdio.h>
26 #include <math.h>
27 #include <immintrin.h>
28 #include "common_tanf.h"
29 
30 extern "C" float __attribute__ ((noinline)) __fs_tan_1_avx2(float const a);
31 
32 
33 /* Payne-Hanek style argument reduction. */
34 static float
reduction_slowpath(float const a,int32_t * h)35 reduction_slowpath(float const a, int32_t *h)
36 {
37     uint2 m;
38     uint32_t ia = float_as_int(a);
39     uint32_t s = ia & 0x80000000;
40     uint32_t result[7];
41     uint32_t hi, lo;
42     uint32_t e;
43     int32_t idx;
44     int32_t q;
45     e = ((ia >> 23) & 0xff) - 127;
46     ia = (ia << 8) | 0x80000000;
47 
48     /* compute x * 2/pi */
49     idx = 4 - ((e >> 5) & 3);
50 
51     hi = 0;
52     for (q = 0; q < 6; q++) {
53         m = umad32wide(i2opi_f[q], ia, hi);
54         lo = m.x;
55         hi = m.y;
56         result[q] = lo;
57     }
58     result[q] = hi;
59 
60     e = e & 31;
61     /* shift result such that hi:lo<63:63> is the least significant
62        integer bit, and hi:lo<62:0> are the fractional bits of the result
63     */
64     uint64_t p = ((uint64_t)result[idx + 2] << 32) | result[idx + 1];
65 
66     if (e) {
67         q = 32 - e;
68         p = (p << e) | (result[idx] >> q);
69     }
70 
71     /* fraction */
72     q = (result[idx + 2] << e) & 0x80000000;
73     p &= 0x7fffffffffffffffULL;
74 
75     if (p & 0x4000000000000000ULL) {
76         p |= 0x8000000000000000ULL;
77         q ^= 0x80000000;
78     }
79     *h = q;
80 
81     double d = (double)(int64_t)p;
82     d *= PI_2_M64;
83     float r = (float)d;
84 
85     return int_as_float(float_as_int(r) ^ s);
86 }
87 
88 float __attribute__ ((noinline))
__fs_tan_1_avx2(float x)89 __fs_tan_1_avx2(float x)
90 {
91 
92     float p, k, r, s, t;
93     int h = 0;
94 
95     p   = int_as_float(float_as_int(x) & 0x7fffffff);
96     if (float_as_int(p) > float_as_int(THRESHOLD_F)) {
97         x = float_as_int(p) >= 0x7f800000 ? x * 0.0f : reduction_slowpath(x, &h);
98     } else {
99         k = FMAF(x, _2_OVER_PI_F, 12582912.0f);
100         h = float_as_int(k) << 31;
101         k -= 12582912.0f;
102         x = FMAF(k, -PI_2_HI_F, x);
103         x = FMAF(k, -PI_2_MI_F, x);
104         x = FMAF(k, -PI_2_LO_F, x);
105     }
106     s = x * x;
107     r = A_F;
108     r = FMAF(r, s, B_F);
109     r = FMAF(r, s, C_F);
110     r = FMAF(r, s, D_F);
111     r = FMAF(r, s, E_F);
112     r = FMAF(r, s, F_F);
113     t = s * x;
114     r = FMAF(r, t, x);
115 
116     if (h) r = -1.0f / r;
117 
118     return r;
119 }
120 
121 #endif // __TAN_F_SCALAR_H__
122 
123