1 
2 /*
3  * Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 
20 #include <assert.h>
21 #include <stdio.h>
22 #include <math.h>
23 #if     defined(__x86_64__)
24 #include <immintrin.h>
25 #endif
26 
27 
28 #define SINCOS_COMMA
29 #if     defined(SINE) && !(defined(COSINE) || defined(SINCOS))
30 #define S(...) __VA_ARGS__
31 #define C(...)
32 #define FNAME   sin
33 #elif   defined(COSINE) && !(defined(SINE) || defined(SINCOS))
34 #define S(...)
35 #define C(...) __VA_ARGS__
36 #define FNAME   cos
37 #elif   defined(SINCOS) && !(defined(SINE) || defined(COSINE))
38 #define S(...) __VA_ARGS__
39 #define C(...) __VA_ARGS__
40 #define FNAME   sincos
41 #include <complex.h>
42 #undef  SINCOS_COMMA
43 #define SINCOS_COMMA    ,
44 #else
45 #error  One of SINE, COSINE, or SINCOS must be defined.
46 #endif
47 
48 #define _CONCAT(l,r) l##r
49 #define CONCAT(l,r) _CONCAT(l,r)
50 
51 #define FCN_NAME    CONCAT(CONCAT(__fd_,FNAME),_1_avx2)
52 
53 #include "sincos.h"
54 
55 extern	"C" double
56 #if     defined(SINCOS)
57     _Complex
58 #endif
59 FCN_NAME(double const x);
60 
61 double
62 #if     defined(SINCOS)
63 _Complex
64 #endif
FCN_NAME(double const x)65 __attribute__((noinline)) FCN_NAME(double const x)
66 {
67     S(double as, ks, rs;)
68     S(uint64_t hs;)
69     S(double ss, fs, ts;)
70 
71     C(double ac, kc, rc;)
72     C(uint64_t hc;)
73     C(double sc, fc, tc;)
74 
75     double t;
76     uint64_t p;
77 
78     p = double_as_ll(x) & 0x7fffffffffffffffULL;
79 
80     if (__builtin_expect(p > double_as_ll(THRESHOLD), 0)) {
81 //        a = p >= 0x7ff0000000000000ULL ? x * 0.0 : reduction_slowpath(x, &h);
82         if (p >= 0x7ff0000000000000ULL) {
83             S(as = x * 0.0);
84             C(ac = x * 0.0);
85         } else {
86             reduction_slowpath(x, S(&as, &hs) SINCOS_COMMA C(&ac, &hc));
87         }
88     } else {
89         S(ks = FMA(x, _1_OVER_PI, 6755399441055744.0);)
90         S(hs = double_as_ll(ks) << 63;)
91         S(ks -= 6755399441055744.0;)
92 
93         C(kc = FMA(ll_as_double(p), _1_OVER_PI, -0.5);)
94         C(kc += 6755399441055744.0;)
95         C(hc = double_as_ll(kc) << 63;)
96         C(kc -= 6755399441055744.0;)
97         C(kc += 0.5;)
98 
99         S(as = FMA(ks, -PI_HI, x);)
100         C(ac = FMA(kc, -PI_HI, ll_as_double(p));)
101 
102         S(as = FMA(ks, -PI_MI, as);)
103         C(ac = FMA(kc, -PI_MI, ac);)
104 
105         S(as = FMA(ks, -PI_LO, as);)
106         C(ac = FMA(kc, -PI_LO, ac);)
107     }
108 
109 #if     1 && defined(SINCOS) && defined (__x86_64__) && defined(__AVX2__)
110     {
111     const __m128d ama = _mm_set_pd(-A_D, A_D);
112     const __m128d bmb = _mm_set_pd(-B_D, B_D);
113     const __m128d cmc = _mm_set_pd(-C_D, C_D);
114     const __m128d dmd = _mm_set_pd(-D_D, D_D);
115     const __m128d eme = _mm_set_pd(-E_D, E_D);
116     const __m128d fmf = _mm_set_pd(-F_D, F_D);
117     const __m128d gmg = _mm_set_pd(-G_D, G_D);
118     const __m128d hmh = _mm_set_pd(-H_D, H_D);
119     const __m128d omo = _mm_set_pd(-1.0, 1.0);
120 
121     __m128d va, vf, vs, vr, vt;
122     va = _mm_unpacklo_pd(_mm_set1_pd(as), _mm_set1_pd(ac));
123     vs = va * va;
124     vr = ama;
125     vr = _mm_fmadd_pd(vr, vs, bmb);
126     vr = _mm_fmadd_pd(vr, vs, cmc);
127     vr = _mm_fmadd_pd(vr, vs, dmd);
128     vr = _mm_fmadd_pd(vr, vs, eme);
129     vr = _mm_fmadd_pd(vr, vs, fmf);
130     vr = _mm_fmadd_pd(vr, vs, gmg);
131     vr = _mm_fmadd_pd(vr, vs, hmh);
132     vf = _mm_unpacklo_pd(_mm_castsi128_pd(_mm_set1_epi64x(hs)), _mm_castsi128_pd(_mm_set1_epi64x(hc)));
133 
134     vf = _mm_xor_pd(va, vf);
135     vt = _mm_fmadd_pd(vs, vf, _mm_set1_pd(0.0));
136     vf = _mm_mul_pd(vf, omo);
137     vr = _mm_fmadd_pd(vr, vt, vf);
138 
139     rs = _mm_cvtsd_f64(vr);
140     rc = _mm_cvtsd_f64(_mm_permute_pd(vr, 1));
141     }
142 #else
143     S(ss = as * as;)
144     C(sc = ac * ac;)
145 
146     S(rs = A_D;)
147     C(rc = -A_D;)
148 
149     S(rs = FMA(rs, ss, B_D);)
150     C(rc = FMA(rc, sc, -B_D);)
151 
152     S(rs = FMA(rs, ss, C_D);)
153     C(rc = FMA(rc, sc, -C_D);)
154 
155     S(rs = FMA(rs, ss, D_D);)
156     C(rc = FMA(rc, sc, -D_D);)
157 
158     S(rs = FMA(rs, ss, E_D);)
159     C(rc = FMA(rc, sc, -E_D);)
160 
161     S(rs = FMA(rs, ss, F_D);)
162     C(rc = FMA(rc, sc, -F_D);)
163 
164     S(rs = FMA(rs, ss, G_D);)
165     C(rc = FMA(rc, sc, -G_D);)
166 
167     S(rs = FMA(rs, ss, H_D);)
168     C(rc = FMA(rc, sc, -H_D);)
169 
170     S(fs = ll_as_double(double_as_ll(as) ^ hs);)
171     C(fc = ll_as_double(double_as_ll(ac) ^ hc);)
172 
173     S(ts = FMA(ss, fs, 0.0);)
174     C(tc = FMA(sc, fc, 0.0);)
175 
176     S(rs = FMA(rs, ts, fs);)
177     C(rc = FMA(rc, tc, -fc);)
178 #endif
179 
180 
181 #if     defined(SINCOS)
182     struct {
183         union {
184             double _Complex c;
185             double          d[2];
186         };
187     } ret_cmplx;
188     ret_cmplx.d[0] = rs;
189     ret_cmplx.d[1] = rc;
190 
191     return ret_cmplx.c;
192 #else
193     S(return rs;)
194     C(return rc;)
195 #endif
196 }
197 
198 
199 #ifdef  UNIT_TEST
200 int
main()201 main()
202 {
203     //float a = 40000+M_PI/6;
204     //float a = -40000-M_PI/6;
205     double a = -M_PI;
206     double args[] = {
207                     -M_PI+(-M_PI/6),//, -M_PI/6,
208                     -0.0,
209                     -(THRESHOLD*2)-M_PI/6, (THRESHOLD*2)+M_PI/6,
210                     -1, 0, 1,
211                     -M_PI*100, M_PI*100,
212                     (-M_PI+(M_PI/6))*100.0,
213                     -M_PI-(M_PI/6), M_PI+(M_PI/6),
214                     -M_PI/6, M_PI/6,
215                     0.1250000,
216     };
217     double rs;
218     double rc;
219 //printf("THRESHOLD=%f\n", THRESHOLD);
220 #ifdef  SINCOS
221     double _Complex ri;
222 
223     for (int i = 0 ; i < sizeof args / sizeof *args; ++i) {
224     a = args[i];
225     printf("%f\n", a);
226     ri = FCN_NAME(a);
227     printf("sincos:sin=%f %f %f\n", creal(ri), sin(a), creal(ri)-sin(a));
228     printf("sincos:cos=%f %f %f\n", cimag(ri), cos(a), cimag(ri)-cos(a));
229     }
230 #else
231     for (int i = 0 ; i < sizeof args / sizeof *args; ++i) {
232     a = args[i];
233     printf("%f\n", a);
234     S(rs = FCN_NAME(a);)
235     C(rc = FCN_NAME(a);)
236     S(printf("sin=%f %f %f\n", rs, sin(a), rs-sin(a));)
237     C(printf("cos=%f %f %f\n", rc, cos(a), rc-cos(a));)
238     }
239 
240 #endif
241 
242     return 0;
243 }
244 #endif
245 // vim: ts=4 expandtab
246 
247