1
2 /*
3 * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19
20 #include <assert.h>
21 #include <stdio.h>
22 #include <math.h>
23 #if defined(__x86_64__)
24 #include <immintrin.h>
25 #endif
26
27
28 #define SINCOS_COMMA
29 #if defined(SINE) && !(defined(COSINE) || defined(SINCOS))
30 #define S(...) __VA_ARGS__
31 #define C(...)
32 #define FNAME sin
33 #elif defined(COSINE) && !(defined(SINE) || defined(SINCOS))
34 #define S(...)
35 #define C(...) __VA_ARGS__
36 #define FNAME cos
37 #elif defined(SINCOS) && !(defined(SINE) || defined(COSINE))
38 #define S(...) __VA_ARGS__
39 #define C(...) __VA_ARGS__
40 #define FNAME sincos
41 #include <complex.h>
42 #undef SINCOS_COMMA
43 #define SINCOS_COMMA ,
44 #else
45 #error One of SINE, COSINE, or SINCOS must be defined.
46 #endif
47
48 #define _CONCAT(l,r) l##r
49 #define CONCAT(l,r) _CONCAT(l,r)
50
51 #define FCN_NAME CONCAT(CONCAT(__fd_,FNAME),_1_avx2)
52
53 #include "sincos.h"
54
55 extern "C" double
56 #if defined(SINCOS)
57 _Complex
58 #endif
59 FCN_NAME(double const x);
60
61 double
62 #if defined(SINCOS)
63 _Complex
64 #endif
FCN_NAME(double const x)65 __attribute__((noinline)) FCN_NAME(double const x)
66 {
67 S(double as, ks, rs;)
68 S(uint64_t hs;)
69 S(double ss, fs, ts;)
70
71 C(double ac, kc, rc;)
72 C(uint64_t hc;)
73 C(double sc, fc, tc;)
74
75 double t;
76 uint64_t p;
77
78 p = double_as_ll(x) & 0x7fffffffffffffffULL;
79
80 if (__builtin_expect(p > double_as_ll(THRESHOLD), 0)) {
81 // a = p >= 0x7ff0000000000000ULL ? x * 0.0 : reduction_slowpath(x, &h);
82 if (p >= 0x7ff0000000000000ULL) {
83 S(as = x * 0.0);
84 C(ac = x * 0.0);
85 } else {
86 reduction_slowpath(x, S(&as, &hs) SINCOS_COMMA C(&ac, &hc));
87 }
88 } else {
89 S(ks = FMA(x, _1_OVER_PI, 6755399441055744.0);)
90 S(hs = double_as_ll(ks) << 63;)
91 S(ks -= 6755399441055744.0;)
92
93 C(kc = FMA(ll_as_double(p), _1_OVER_PI, -0.5);)
94 C(kc += 6755399441055744.0;)
95 C(hc = double_as_ll(kc) << 63;)
96 C(kc -= 6755399441055744.0;)
97 C(kc += 0.5;)
98
99 S(as = FMA(ks, -PI_HI, x);)
100 C(ac = FMA(kc, -PI_HI, ll_as_double(p));)
101
102 S(as = FMA(ks, -PI_MI, as);)
103 C(ac = FMA(kc, -PI_MI, ac);)
104
105 S(as = FMA(ks, -PI_LO, as);)
106 C(ac = FMA(kc, -PI_LO, ac);)
107 }
108
109 #if 1 && defined(SINCOS) && defined (__x86_64__) && defined(__AVX2__)
110 {
111 const __m128d ama = _mm_set_pd(-A_D, A_D);
112 const __m128d bmb = _mm_set_pd(-B_D, B_D);
113 const __m128d cmc = _mm_set_pd(-C_D, C_D);
114 const __m128d dmd = _mm_set_pd(-D_D, D_D);
115 const __m128d eme = _mm_set_pd(-E_D, E_D);
116 const __m128d fmf = _mm_set_pd(-F_D, F_D);
117 const __m128d gmg = _mm_set_pd(-G_D, G_D);
118 const __m128d hmh = _mm_set_pd(-H_D, H_D);
119 const __m128d omo = _mm_set_pd(-1.0, 1.0);
120
121 __m128d va, vf, vs, vr, vt;
122 va = _mm_unpacklo_pd(_mm_set1_pd(as), _mm_set1_pd(ac));
123 vs = va * va;
124 vr = ama;
125 vr = _mm_fmadd_pd(vr, vs, bmb);
126 vr = _mm_fmadd_pd(vr, vs, cmc);
127 vr = _mm_fmadd_pd(vr, vs, dmd);
128 vr = _mm_fmadd_pd(vr, vs, eme);
129 vr = _mm_fmadd_pd(vr, vs, fmf);
130 vr = _mm_fmadd_pd(vr, vs, gmg);
131 vr = _mm_fmadd_pd(vr, vs, hmh);
132 vf = _mm_unpacklo_pd(_mm_castsi128_pd(_mm_set1_epi64x(hs)), _mm_castsi128_pd(_mm_set1_epi64x(hc)));
133
134 vf = _mm_xor_pd(va, vf);
135 vt = _mm_fmadd_pd(vs, vf, _mm_set1_pd(0.0));
136 vf = _mm_mul_pd(vf, omo);
137 vr = _mm_fmadd_pd(vr, vt, vf);
138
139 rs = _mm_cvtsd_f64(vr);
140 rc = _mm_cvtsd_f64(_mm_permute_pd(vr, 1));
141 }
142 #else
143 S(ss = as * as;)
144 C(sc = ac * ac;)
145
146 S(rs = A_D;)
147 C(rc = -A_D;)
148
149 S(rs = FMA(rs, ss, B_D);)
150 C(rc = FMA(rc, sc, -B_D);)
151
152 S(rs = FMA(rs, ss, C_D);)
153 C(rc = FMA(rc, sc, -C_D);)
154
155 S(rs = FMA(rs, ss, D_D);)
156 C(rc = FMA(rc, sc, -D_D);)
157
158 S(rs = FMA(rs, ss, E_D);)
159 C(rc = FMA(rc, sc, -E_D);)
160
161 S(rs = FMA(rs, ss, F_D);)
162 C(rc = FMA(rc, sc, -F_D);)
163
164 S(rs = FMA(rs, ss, G_D);)
165 C(rc = FMA(rc, sc, -G_D);)
166
167 S(rs = FMA(rs, ss, H_D);)
168 C(rc = FMA(rc, sc, -H_D);)
169
170 S(fs = ll_as_double(double_as_ll(as) ^ hs);)
171 C(fc = ll_as_double(double_as_ll(ac) ^ hc);)
172
173 S(ts = FMA(ss, fs, 0.0);)
174 C(tc = FMA(sc, fc, 0.0);)
175
176 S(rs = FMA(rs, ts, fs);)
177 C(rc = FMA(rc, tc, -fc);)
178 #endif
179
180
181 #if defined(SINCOS)
182 struct {
183 union {
184 double _Complex c;
185 double d[2];
186 };
187 } ret_cmplx;
188 ret_cmplx.d[0] = rs;
189 ret_cmplx.d[1] = rc;
190
191 return ret_cmplx.c;
192 #else
193 S(return rs;)
194 C(return rc;)
195 #endif
196 }
197
198
199 #ifdef UNIT_TEST
200 int
main()201 main()
202 {
203 //float a = 40000+M_PI/6;
204 //float a = -40000-M_PI/6;
205 double a = -M_PI;
206 double args[] = {
207 -M_PI+(-M_PI/6),//, -M_PI/6,
208 -0.0,
209 -(THRESHOLD*2)-M_PI/6, (THRESHOLD*2)+M_PI/6,
210 -1, 0, 1,
211 -M_PI*100, M_PI*100,
212 (-M_PI+(M_PI/6))*100.0,
213 -M_PI-(M_PI/6), M_PI+(M_PI/6),
214 -M_PI/6, M_PI/6,
215 0.1250000,
216 };
217 double rs;
218 double rc;
219 //printf("THRESHOLD=%f\n", THRESHOLD);
220 #ifdef SINCOS
221 double _Complex ri;
222
223 for (int i = 0 ; i < sizeof args / sizeof *args; ++i) {
224 a = args[i];
225 printf("%f\n", a);
226 ri = FCN_NAME(a);
227 printf("sincos:sin=%f %f %f\n", creal(ri), sin(a), creal(ri)-sin(a));
228 printf("sincos:cos=%f %f %f\n", cimag(ri), cos(a), cimag(ri)-cos(a));
229 }
230 #else
231 for (int i = 0 ; i < sizeof args / sizeof *args; ++i) {
232 a = args[i];
233 printf("%f\n", a);
234 S(rs = FCN_NAME(a);)
235 C(rc = FCN_NAME(a);)
236 S(printf("sin=%f %f %f\n", rs, sin(a), rs-sin(a));)
237 C(printf("cos=%f %f %f\n", rc, cos(a), rc-cos(a));)
238 }
239
240 #endif
241
242 return 0;
243 }
244 #endif
245 // vim: ts=4 expandtab
246
247