1
2 /*
3 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19
20 #ifndef __TAN_F_SCALAR_H__
21 #define __TAN_F_SCALAR_H__
22
23
24 #include <assert.h>
25 #include <stdio.h>
26 #include <math.h>
27 #include <immintrin.h>
28 #include "common_tanf.h"
29
30 extern "C" float __attribute__ ((noinline)) __fs_tan_1_avx2(float const a);
31
32
33 /* Payne-Hanek style argument reduction. */
34 static float
reduction_slowpath(float const a,int32_t * h)35 reduction_slowpath(float const a, int32_t *h)
36 {
37 uint2 m;
38 uint32_t ia = float_as_int(a);
39 uint32_t s = ia & 0x80000000;
40 uint32_t result[7];
41 uint32_t hi, lo;
42 uint32_t e;
43 int32_t idx;
44 int32_t q;
45 e = ((ia >> 23) & 0xff) - 127;
46 ia = (ia << 8) | 0x80000000;
47
48 /* compute x * 2/pi */
49 idx = 4 - ((e >> 5) & 3);
50
51 hi = 0;
52 for (q = 0; q < 6; q++) {
53 m = umad32wide(i2opi_f[q], ia, hi);
54 lo = m.x;
55 hi = m.y;
56 result[q] = lo;
57 }
58 result[q] = hi;
59
60 e = e & 31;
61 /* shift result such that hi:lo<63:63> is the least significant
62 integer bit, and hi:lo<62:0> are the fractional bits of the result
63 */
64 uint64_t p = ((uint64_t)result[idx + 2] << 32) | result[idx + 1];
65
66 if (e) {
67 q = 32 - e;
68 p = (p << e) | (result[idx] >> q);
69 }
70
71 /* fraction */
72 q = (result[idx + 2] << e) & 0x80000000;
73 p &= 0x7fffffffffffffffULL;
74
75 if (p & 0x4000000000000000ULL) {
76 p |= 0x8000000000000000ULL;
77 q ^= 0x80000000;
78 }
79 *h = q;
80
81 double d = (double)(int64_t)p;
82 d *= PI_2_M64;
83 float r = (float)d;
84
85 return int_as_float(float_as_int(r) ^ s);
86 }
87
88 float __attribute__ ((noinline))
__fs_tan_1_avx2(float x)89 __fs_tan_1_avx2(float x)
90 {
91
92 float p, k, r, s, t;
93 int h = 0;
94
95 p = int_as_float(float_as_int(x) & 0x7fffffff);
96 if (float_as_int(p) > float_as_int(THRESHOLD_F)) {
97 x = float_as_int(p) >= 0x7f800000 ? x * 0.0f : reduction_slowpath(x, &h);
98 } else {
99 k = FMAF(x, _2_OVER_PI_F, 12582912.0f);
100 h = float_as_int(k) << 31;
101 k -= 12582912.0f;
102 x = FMAF(k, -PI_2_HI_F, x);
103 x = FMAF(k, -PI_2_MI_F, x);
104 x = FMAF(k, -PI_2_LO_F, x);
105 }
106 s = x * x;
107 r = A_F;
108 r = FMAF(r, s, B_F);
109 r = FMAF(r, s, C_F);
110 r = FMAF(r, s, D_F);
111 r = FMAF(r, s, E_F);
112 r = FMAF(r, s, F_F);
113 t = s * x;
114 r = FMAF(r, t, x);
115
116 if (h) r = -1.0f / r;
117
118 return r;
119 }
120
121 #endif // __TAN_F_SCALAR_H__
122
123